diff --git a/src/KSPACE/pppm_cg.cpp b/src/KSPACE/pppm_cg.cpp index 499b55c08f..93d93c48c3 100644 --- a/src/KSPACE/pppm_cg.cpp +++ b/src/KSPACE/pppm_cg.cpp @@ -19,8 +19,10 @@ #include "mpi.h" #include "math.h" #include "stdlib.h" +#include "string.h" #include "atom.h" +#include "commgrid.h" #include "domain.h" #include "error.h" #include "force.h" @@ -34,7 +36,11 @@ using namespace MathConst; #define OFFSET 16384 #define SMALLQ 0.00001 -#if defined(FFT_SINGLE) + +enum{REVERSE_RHO}; +enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; + +#ifdef FFT_SINGLE #define ZEROF 0.0f #else #define ZEROF 0.0 @@ -42,7 +48,7 @@ using namespace MathConst; /* ---------------------------------------------------------------------- */ -PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPMOld(lmp, narg, arg) +PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg) { if ((narg < 1) || (narg > 2)) error->all(FLERR,"Illegal kspace_style pppm/cg command"); @@ -54,6 +60,7 @@ PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPMOld(lmp, narg, arg) num_charged = -1; is_charged = NULL; + group_group_enable = 1; } /* ---------------------------------------------------------------------- @@ -71,7 +78,6 @@ PPPMCG::~PPPMCG() void PPPMCG::compute(int eflag, int vflag) { - int i,j; // set energy/virial flags // invoke allocate_peratom() if needed for first time @@ -82,6 +88,8 @@ void PPPMCG::compute(int eflag, int vflag) if (evflag_atom && !peratom_allocate_flag) { allocate_peratom(); + cg_peratom->ghost_notify(); + cg_peratom->setup(); peratom_allocate_flag = 1; } @@ -110,7 +118,7 @@ void PPPMCG::compute(int eflag, int vflag) double charged_frac, charged_fmax, charged_fmin; num_charged=0; - for (i=0; i < atom->nlocal; ++i) + for (int i=0; i < atom->nlocal; ++i) if (fabs(atom->q[i]) > smallq) ++num_charged; @@ -148,7 +156,7 @@ void PPPMCG::compute(int eflag, int vflag) } num_charged = 0; - for (i = 0; i < atom->nlocal; ++i) + for (int i = 0; i < atom->nlocal; ++i) if (fabs(atom->q[i]) > smallq) { is_charged[num_charged] = i; ++num_charged; @@ -164,6 +172,7 @@ void PPPMCG::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition + cg->reverse_comm(this,REVERSE_RHO); brick2fft(); // compute potential gradient on my FFT grid and @@ -176,11 +185,17 @@ void PPPMCG::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - fillbrick(); + if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); + else cg->forward_comm(this,FORWARD_IK); // extra per-atom energy/virial communication - if (evflag_atom) fillbrick_peratom(); + if (evflag_atom) { + if (differentiation_flag == 1 && vflag_atom) + cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + else if (differentiation_flag == 0) + cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + } // calculate the force on my particles @@ -210,19 +225,18 @@ void PPPMCG::compute(int eflag, int vflag) if (vflag_global) { double virial_all[6]; MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); - for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + for (int i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; } // per-atom energy/virial // energy includes self-energy correction if (evflag_atom) { - double *q = atom->q; - int nlocal = atom->nlocal; + const double * const q = atom->q; if (eflag_atom) { for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; + const int i = 
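/* ----------------------------------------------------------------------
   a standalone sketch of the charged-list compression that compute()
   above builds: atoms with |q| > smallq are gathered into is_charged
   once per step, so every later interpolation loop only touches them.
   plain C++, not the LAMMPS API; build_charged_list is a hypothetical
   name used for illustration
------------------------------------------------------------------------- */

#include <cmath>
#include <vector>

// collect the indices of atoms whose charge magnitude exceeds the cutoff
std::vector<int> build_charged_list(const double *q, int nlocal, double smallq)
{
  std::vector<int> charged;
  charged.reserve(nlocal);
  for (int i = 0; i < nlocal; ++i)
    if (std::fabs(q[i]) > smallq) charged.push_back(i);
  return charged;
}

// consumers then iterate the compressed list only:
//   for (int j = 0; j < (int) charged.size(); ++j) { const int i = charged[j]; ... }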
is_charged[j]; eatom[i] *= 0.5; eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / (g_ewald*g_ewald*volume); @@ -231,16 +245,16 @@ void PPPMCG::compute(int eflag, int vflag) } if (vflag_atom) { - for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; - for (int n = 0; n < 6; n++) vatom[i][n] *= 0.5*q[i]*qscale; + for (int n = 0; n < num_charged; n++) { + const int i = is_charged[n]; + for (int j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale; } } } // 2d slab correction - if (slabflag) slabcorr(); + if (slabflag == 1) slabcorr(); // convert atoms back from lamda to box coords @@ -279,7 +293,8 @@ void PPPMCG::particle_map() if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || ny+nlower < nylo_out || ny+nupper > nyhi_out || - nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag = 1; + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag = 1; } if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); @@ -299,8 +314,8 @@ void PPPMCG::make_rho() // clear 3d density array - FFT_SCALAR *vec = &density_brick[nzlo_out][nylo_out][nxlo_out]; - for (i = 0; i < ngrid; i++) vec[i] = ZEROF; + memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -311,7 +326,7 @@ void PPPMCG::make_rho() double **x = atom->x; for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; + i = is_charged[j]; nx = part2grid[i][0]; ny = part2grid[i][1]; @@ -337,12 +352,11 @@ void PPPMCG::make_rho() } } } - /* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles + interpolate from grid to get electric field & force on my particles for ik ------------------------------------------------------------------------- */ -void PPPMCG::fieldforce() +void PPPMCG::fieldforce_ik() { int i,l,m,n,nx,ny,nz,mx,my,mz; FFT_SCALAR dx,dy,dz,x0,y0,z0; @@ -392,74 +406,162 @@ void PPPMCG::fieldforce() const double qfactor = force->qqrd2e * scale * q[i]; f[i][0] += qfactor*ekx; f[i][1] += qfactor*eky; - f[i][2] += qfactor*ekz; + if (slabflag != 2) f[i][2] += qfactor*ekz; } } /* ---------------------------------------------------------------------- - interpolate from grid to get per-atom energy/virial - ------------------------------------------------------------------------- */ + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ + +void PPPMCG::fieldforce_ad() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz; + FFT_SCALAR ekx,eky,ekz; + double s1,s2,s3; + double sf = 0.0; + double *prd; + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + + double hx_inv = nx_pppm/xprd; + double hy_inv = ny_pppm/yprd; + double hz_inv = nz_pppm/zprd; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + for (int j = 0; j < num_charged; j++) { + i = is_charged[j]; + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - 
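/* ----------------------------------------------------------------------
   in the make_rho() hunk above, the explicit clearing loop becomes a
   single memset over ngrid values; that is only valid because the 3d
   density brick is backed by one contiguous allocation with pointer
   tiers layered on top. a self-contained sketch of such a layout
   (hypothetical helper, not LAMMPS' Memory class; error checks and
   deallocation omitted for brevity)
------------------------------------------------------------------------- */

#include <cstdlib>
#include <cstring>

// allocate an nz x ny x nx array whose data lives in one contiguous block
double ***create3d(int nz, int ny, int nx)
{
  double *data = (double *) calloc((size_t) nz*ny*nx, sizeof(double));
  double **rows = (double **) malloc((size_t) nz*ny*sizeof(double *));
  double ***planes = (double ***) malloc((size_t) nz*sizeof(double **));
  for (int k = 0; k < nz; ++k) {
    planes[k] = &rows[(size_t) k*ny];
    for (int j = 0; j < ny; ++j)
      rows[(size_t) k*ny + j] = &data[((size_t) k*ny + j)*nx];
  }
  return planes;
}

// because the data block is contiguous, clearing it is a single call:
//   memset(&brick[0][0][0], 0, (size_t) nz*ny*nx*sizeof(double));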
(x[i][0]-boxlo[0])*delxinv;
+ dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+ dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+ compute_rho1d(dx,dy,dz);
+ compute_drho1d(dx,dy,dz);
+
+ ekx = eky = ekz = ZEROF;
+ for (n = nlower; n <= nupper; n++) {
+ mz = n+nz;
+ for (m = nlower; m <= nupper; m++) {
+ my = m+ny;
+ for (l = nlower; l <= nupper; l++) {
+ mx = l+nx;
+ ekx += drho1d[0][l]*rho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx];
+ eky += rho1d[0][l]*drho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx];
+ ekz += rho1d[0][l]*rho1d[1][m]*drho1d[2][n]*u_brick[mz][my][mx];
+ }
+ }
+ }
+ ekx *= hx_inv;
+ eky *= hy_inv;
+ ekz *= hz_inv;
+
+ // convert E-field to force and subtract self forces
+
+ const double qfactor = force->qqrd2e * scale;
+
+ s1 = x[i][0]*hx_inv;
+ s2 = x[i][1]*hy_inv;
+ s3 = x[i][2]*hz_inv;
+ sf = sf_coeff[0]*sin(2*MY_PI*s1);
+ sf += sf_coeff[1]*sin(4*MY_PI*s1);
+ sf *= 2*q[i]*q[i];
+ f[i][0] += qfactor*(ekx*q[i] - sf);
+
+ sf = sf_coeff[2]*sin(2*MY_PI*s2);
+ sf += sf_coeff[3]*sin(4*MY_PI*s2);
+ sf *= 2*q[i]*q[i];
+ f[i][1] += qfactor*(eky*q[i] - sf);
+
+ sf = sf_coeff[4]*sin(2*MY_PI*s3);
+ sf += sf_coeff[5]*sin(4*MY_PI*s3);
+ sf *= 2*q[i]*q[i];
+ if (slabflag != 2) f[i][2] += qfactor*(ekz*q[i] - sf);
+ }
+}
+
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */

 void PPPMCG::fieldforce_peratom()
 {
- int i,l,m,n,nx,ny,nz,mx,my,mz;
- FFT_SCALAR dx,dy,dz,x0,y0,z0;
- FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+ int i,l,m,n,nx,ny,nz,mx,my,mz;
+ FFT_SCALAR dx,dy,dz,x0,y0,z0;
+ FFT_SCALAR u,v0,v1,v2,v3,v4,v5;

- // loop over my charges, interpolate from nearby grid points
- // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
- // (dx,dy,dz) = distance to "lower left" grid pt
- // (mx,my,mz) = global coords of moving stencil pt
+ // loop over my charges, interpolate from nearby grid points
+ // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+ // (dx,dy,dz) = distance to "lower left" grid pt
+ // (mx,my,mz) = global coords of moving stencil pt

- double *q = atom->q;
- double **x = atom->x;
- double **f = atom->f;
+ double *q = atom->q;
+ double **x = atom->x;

- for (int j = 0; j < num_charged; j++) {
- i = is_charged[j];
+ for (int j = 0; j < num_charged; j++) {
+ i = is_charged[j];

- nx = part2grid[i][0];
- ny = part2grid[i][1];
- nz = part2grid[i][2];
- dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
- dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
- dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+ nx = part2grid[i][0];
+ ny = part2grid[i][1];
+ nz = part2grid[i][2];
+ dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+ dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+ dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

- compute_rho1d(dx,dy,dz);
+ compute_rho1d(dx,dy,dz);

- u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
- for (n = nlower; n <= nupper; n++) {
- mz = n+nz;
- z0 = rho1d[2][n];
- for (m = nlower; m <= nupper; m++) {
- my = m+ny;
- y0 = z0*rho1d[1][m];
- for (l = nlower; l <= nupper; l++) {
- mx = l+nx;
- x0 = y0*rho1d[0][l];
- if (eflag_atom) u += x0*u_brick[mz][my][mx];
- if (vflag_atom) {
- v0 += x0*v0_brick[mz][my][mx];
- v1 += x0*v1_brick[mz][my][mx];
- v2 += x0*v2_brick[mz][my][mx];
- v3 += x0*v3_brick[mz][my][mx];
- v4 += x0*v4_brick[mz][my][mx];
- v5 += x0*v5_brick[mz][my][mx];
- }
- }
- }
- }
-
- if (eflag_atom) eatom[i] += q[i]*u;
- if (vflag_atom) {
- vatom[i][0] += v0;
- vatom[i][1] += v1;
- vatom[i][2] += 
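/* ----------------------------------------------------------------------
   in fieldforce_ad() above the force is q*E with E taken from the
   gradient of the interpolated potential, so the discretization leaves
   a spurious self force that the sf_coeff terms remove. a sketch of
   the per-axis correction, assuming s = x/h is the position in grid
   units and c2,c4 stand for one pair of precomputed coefficients
------------------------------------------------------------------------- */

#include <cmath>

// spurious self-force contribution along one axis; it is subtracted from
// q*E before the qqrd2e scaling, mirroring the expression used above
static double self_force_term(double c2, double c4, double s, double q)
{
  const double twopi = 6.28318530717958647692;
  const double sf = c2*std::sin(twopi*s) + c4*std::sin(2.0*twopi*s);
  return 2.0*q*q*sf;
}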
v2; - vatom[i][3] += v3; - vatom[i][4] += v4; - vatom[i][5] += v5; + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + if (eflag_atom) u += x0*u_brick[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick[mz][my][mx]; + v1 += x0*v1_brick[mz][my][mx]; + v2 += x0*v2_brick[mz][my][mx]; + v3 += x0*v3_brick[mz][my][mx]; + v4 += x0*v4_brick[mz][my][mx]; + v5 += x0*v5_brick[mz][my][mx]; + } } + } } + + if (eflag_atom) eatom[i] += q[i]*u; + if (vflag_atom) { + vatom[i][0] += q[i]*v0; + vatom[i][1] += q[i]*v1; + vatom[i][2] += q[i]*v2; + vatom[i][3] += q[i]*v3; + vatom[i][4] += q[i]*v4; + vatom[i][5] += q[i]*v5; + } + } } /* ---------------------------------------------------------------------- @@ -471,14 +573,17 @@ void PPPMCG::fieldforce_peratom() void PPPMCG::slabcorr() { + int i,j; + // compute local contribution to global dipole moment - double *q = atom->q; - double **x = atom->x; - + const double * const q = atom->q; + const double * const * const x = atom->x; double dipole = 0.0; - for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; + + + for (j = 0; j < num_charged; j++) { + i = is_charged[j]; dipole += q[i]*x[i][2]; } @@ -489,29 +594,101 @@ void PPPMCG::slabcorr() // compute corrections - const double e_slabcorr = 2.0*MY_PI*dipole_all*dipole_all/volume; + const double e_slabcorr = MY_2PI*dipole_all*dipole_all/volume; const double qscale = force->qqrd2e * scale; if (eflag_global) energy += qscale * e_slabcorr; - //per-atom energy + // per-atom energy if (eflag_atom) { - double efact = 2.0*MY_PI*dipole_all/volume; - for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; + const double efact = MY_2PI*dipole_all/volume; + for (j = 0; j < num_charged; j++) { + i = is_charged[j]; eatom[i] += qscale * q[i]*x[i][2]*efact; } } // add on force corrections - const double ffact = -4.0*MY_PI*dipole_all/volume * qscale; - double **f = atom->f; + const double ffact = -MY_4PI*dipole_all/volume; + double * const * const f = atom->f; + + for (j = 0; j < num_charged; j++) { + i = is_charged[j]; + f[i][2] += qscale * q[i]*ffact; + } +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid for group-group interactions + ------------------------------------------------------------------------- */ + +void PPPMCG::make_rho_groups(int groupbit_A, int groupbit_B, int BA_flag) +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density arrays + + memset(&(density_A_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + memset(&(density_B_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + const double * const q = atom->q; + const double * const * const x = atom->x; + const int * const mask = atom->mask; for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; - f[i][2] += q[i]*ffact; + i = is_charged[j]; + + 
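/* ----------------------------------------------------------------------
   slabcorr() above applies the Yeh-Berkowitz correction for slab
   geometries: the net dipole M_z adds an energy 2*pi*M_z^2/V and a
   force -4*pi*q_i*M_z/V on every atom. a minimal MPI sketch of the
   same arithmetic (standalone signature, not the LAMMPS interface;
   assumes MPI has been initialized)
------------------------------------------------------------------------- */

#include <mpi.h>

// qqrd2e converts charge^2/length into energy units, as in the code above
void slab_correction(const double *q, const double *z, double *fz, int nlocal,
                     double volume, double qqrd2e, double *energy, MPI_Comm world)
{
  const double pi = 3.14159265358979323846;
  double dipole = 0.0, dipole_all = 0.0;
  for (int i = 0; i < nlocal; ++i) dipole += q[i]*z[i];   // local contribution
  MPI_Allreduce(&dipole, &dipole_all, 1, MPI_DOUBLE, MPI_SUM, world);

  *energy += qqrd2e * 2.0*pi*dipole_all*dipole_all/volume;
  const double ffact = -4.0*pi*dipole_all/volume;
  for (int i = 0; i < nlocal; ++i) fz[i] += qqrd2e * q[i]*ffact;
}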
if ((mask[i] & groupbit_A) && (mask[i] & groupbit_B)) + if (BA_flag) continue; + + if ((mask[i] & groupbit_A) || (mask[i] & groupbit_B)) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + z0 = delvolinv * q[i]; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + y0 = z0*rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + x0 = y0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + + // group A + + if (mask[i] & groupbit_A) + density_A_brick[mz][my][mx] += x0*rho1d[0][l]; + + // group B + + if (mask[i] & groupbit_B) + density_B_brick[mz][my][mx] += x0*rho1d[0][l]; + } + } + } + } } } @@ -521,7 +698,7 @@ void PPPMCG::slabcorr() double PPPMCG::memory_usage() { - double bytes = PPPMOld::memory_usage(); + double bytes = PPPM::memory_usage(); bytes += nmax * sizeof(int); return bytes; } diff --git a/src/KSPACE/pppm_cg.h b/src/KSPACE/pppm_cg.h index 6215d13249..3fee5fb1e0 100644 --- a/src/KSPACE/pppm_cg.h +++ b/src/KSPACE/pppm_cg.h @@ -20,11 +20,11 @@ KSpaceStyle(pppm/cg,PPPMCG) #ifndef LMP_PPPM_CG_H #define LMP_PPPM_CG_H -#include "pppm_old.h" +#include "pppm.h" namespace LAMMPS_NS { -class PPPMCG : public PPPMOld { +class PPPMCG : public PPPM { public: PPPMCG(class LAMMPS *, int, char **); virtual ~PPPMCG(); @@ -38,9 +38,11 @@ class PPPMCG : public PPPMOld { virtual void particle_map(); virtual void make_rho(); - virtual void fieldforce(); + virtual void fieldforce_ik(); + virtual void fieldforce_ad(); virtual void fieldforce_peratom(); virtual void slabcorr(); + virtual void make_rho_groups(int, int, int); }; } diff --git a/src/KSPACE/pppm_tip4p.cpp b/src/KSPACE/pppm_tip4p.cpp index 4abd241006..dd2f3da0cc 100644 --- a/src/KSPACE/pppm_tip4p.cpp +++ b/src/KSPACE/pppm_tip4p.cpp @@ -226,7 +226,7 @@ void PPPMTIP4P::fieldforce_ik() if (type[i] != typeO) { f[i][0] += qfactor*ekx; f[i][1] += qfactor*eky; - f[i][2] += qfactor*ekz; + if (slabflag != 2) f[i][2] += qfactor*ekz; } else { fx = qfactor * ekx; @@ -236,15 +236,15 @@ void PPPMTIP4P::fieldforce_ik() f[i][0] += fx*(1 - alpha); f[i][1] += fy*(1 - alpha); - f[i][2] += fz*(1 - alpha); + if (slabflag != 2) f[i][2] += fz*(1 - alpha); f[iH1][0] += 0.5*alpha*fx; f[iH1][1] += 0.5*alpha*fy; - f[iH1][2] += 0.5*alpha*fz; + if (slabflag != 2) f[iH1][2] += 0.5*alpha*fz; f[iH2][0] += 0.5*alpha*fx; f[iH2][1] += 0.5*alpha*fy; - f[iH2][2] += 0.5*alpha*fz; + if (slabflag != 2) f[iH2][2] += 0.5*alpha*fz; } } } @@ -256,7 +256,7 @@ void PPPMTIP4P::fieldforce_ik() void PPPMTIP4P::fieldforce_ad() { int i,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0,dx0,dy0,dz0; + FFT_SCALAR dx,dy,dz; FFT_SCALAR ekx,eky,ekz; double *xi; int iH1,iH2; @@ -272,13 +272,10 @@ void PPPMTIP4P::fieldforce_ad() double xprd = prd[0]; double yprd = prd[1]; double zprd = prd[2]; - double zprd_slab = zprd*slab_volfactor; double hx_inv = nx_pppm/xprd; double hy_inv = ny_pppm/yprd; double hz_inv = nz_pppm/zprd; - - // loop over my charges, interpolate electric field from nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -302,9 +299,9 @@ void PPPMTIP4P::fieldforce_ad() nx = part2grid[i][0]; ny = part2grid[i][1]; nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + dx = nx+shiftone - 
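/* ----------------------------------------------------------------------
   make_rho_groups() above relies on LAMMPS-style group bitmasks: every
   group owns one bit of the per-atom mask, so membership in A, in B,
   or in both is a single AND each. sketch of the tests
------------------------------------------------------------------------- */

// groupbit_A and groupbit_B each have exactly one bit set
static inline bool in_group(int mask, int groupbit) { return (mask & groupbit) != 0; }

// the BA_flag test above skips atoms belonging to both groups so the
// A<->B contribution is not counted twice:
//   if (in_group(mask[i],gbA) && in_group(mask[i],gbB) && BA_flag) continue;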
(xi[0]-boxlo[0])*delxinv; + dy = ny+shiftone - (xi[1]-boxlo[1])*delyinv; + dz = nz+shiftone - (xi[2]-boxlo[2])*delzinv; compute_rho1d(dx,dy,dz); compute_drho1d(dx,dy,dz); @@ -335,38 +332,38 @@ void PPPMTIP4P::fieldforce_ad() s3 = x[i][2]*hz_inv; sf = sf_coeff[0]*sin(2*MY_PI*s1); sf += sf_coeff[1]*sin(4*MY_PI*s1); - sf *= 2*q[i]*q[i]; + sf *= 2.0*q[i]*q[i]; fx = qfactor*(ekx*q[i] - sf); sf = sf_coeff[2]*sin(2*MY_PI*s2); sf += sf_coeff[3]*sin(4*MY_PI*s2); - sf *= 2*q[i]*q[i]; + sf *= 2.0*q[i]*q[i]; fy = qfactor*(eky*q[i] - sf); sf = sf_coeff[4]*sin(2*MY_PI*s3); sf += sf_coeff[5]*sin(4*MY_PI*s3); - sf *= 2*q[i]*q[i]; + sf *= 2.0*q[i]*q[i]; fz = qfactor*(ekz*q[i] - sf); if (type[i] != typeO) { f[i][0] += fx; f[i][1] += fy; - f[i][2] += fz; + if (slabflag != 2) f[i][2] += fz; } else { find_M(i,iH1,iH2,xM); f[i][0] += fx*(1 - alpha); f[i][1] += fy*(1 - alpha); - f[i][2] += fz*(1 - alpha); + if (slabflag != 2) f[i][2] += fz*(1 - alpha); f[iH1][0] += 0.5*alpha*fx; f[iH1][1] += 0.5*alpha*fy; - f[iH1][2] += 0.5*alpha*fz; + if (slabflag != 2) f[iH1][2] += 0.5*alpha*fz; f[iH2][0] += 0.5*alpha*fx; f[iH2][1] += 0.5*alpha*fy; - f[iH2][2] += 0.5*alpha*fz; + if (slabflag != 2) f[iH2][2] += 0.5*alpha*fz; } } } @@ -392,7 +389,6 @@ void PPPMTIP4P::fieldforce_peratom() // ek = 3 components of E-field on particle double *q = atom->q; double **x = atom->x; - double **f = atom->f; int *type = atom->type; int nlocal = atom->nlocal; diff --git a/src/USER-OMP/pppm_cg_omp.cpp b/src/USER-OMP/pppm_cg_omp.cpp index 51fc048b3b..e65a9265e5 100644 --- a/src/USER-OMP/pppm_cg_omp.cpp +++ b/src/USER-OMP/pppm_cg_omp.cpp @@ -19,9 +19,12 @@ #include "atom.h" #include "comm.h" #include "domain.h" +#include "error.h" +#include "fix_omp.h" #include "force.h" #include "memory.h" #include "math_const.h" +#include "math_special.h" #include #include @@ -29,12 +32,14 @@ #include "suffix.h" using namespace LAMMPS_NS; using namespace MathConst; +using namespace MathSpecial; -#define SMALLQ 0.00001 -#if defined(FFT_SINGLE) +#ifdef FFT_SINGLE #define ZEROF 0.0f +#define ONEF 1.0f #else #define ZEROF 0.0 +#define ONEF 1.0 #endif #define EPS_HOC 1.0e-7 @@ -55,7 +60,27 @@ void PPPMCGOMP::allocate() { PPPMCG::allocate(); - const int nthreads = comm->nthreads; +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + thr->init_pppm(order,memory); + } +} + +/* ---------------------------------------------------------------------- + free memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPMCGOMP::deallocate() +{ + PPPMCG::deallocate(); #if defined(_OPENMP) #pragma omp parallel default(none) @@ -66,153 +91,26 @@ void PPPMCGOMP::allocate() #else const int tid = 0; #endif - - FFT_SCALAR **rho1d_thr; - memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr"); ThrData *thr = fix->get_thr(tid); - thr->init_pppm(static_cast(rho1d_thr)); - } - - const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out -1; - - // reallocate density brick, so it fits our needs - memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); - memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:density_brick"); -} - -// NOTE: special version of reduce_data for FFT_SCALAR data type. 
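/* ----------------------------------------------------------------------
   the pppm_tip4p.cpp hunks above evaluate the k-space force at the
   massless M site and spread it onto the real O and H atoms with the
   geometric weight alpha. a sketch of that transfer for one molecule,
   using plain arrays rather than the find_M() bookkeeping
------------------------------------------------------------------------- */

// fM is the force computed at the M site; alpha lies in (0,1)
static void spread_m_site_force(const double fM[3], double fO[3],
                                double fH1[3], double fH2[3], double alpha)
{
  for (int d = 0; d < 3; ++d) {
    fO[d]  += (1.0 - alpha)*fM[d];   // oxygen keeps the larger share
    fH1[d] += 0.5*alpha*fM[d];       // the remainder is split evenly
    fH2[d] += 0.5*alpha*fM[d];       // between the two hydrogens
  }
}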
-// reduce per thread data into the first part of the data -// array that is used for the non-threaded parts and reset -// the temporary storage to 0.0. this routine depends on -// multi-dimensional arrays like force stored in this order -// x1,y1,z1,x2,y2,z2,... -// we need to post a barrier to wait until all threads are done -// writing to the array. -static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid) -{ -#if defined(_OPENMP) - // NOOP in non-threaded execution. - if (nthreads == 1) return; -#pragma omp barrier - { - const int nvals = ndim*nall; - const int idelta = nvals/nthreads + 1; - const int ifrom = tid*idelta; - const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta); - - // this if protects against having more threads than atoms - if (ifrom < nall) { - for (int m = ifrom; m < ito; ++m) { - for (int n = 1; n < nthreads; ++n) { - dall[m] += dall[n*nvals + m]; - dall[n*nvals + m] = 0.0; - } - } - } - } -#else - // NOOP in non-threaded execution. - return; -#endif -} - -/* ---------------------------------------------------------------------- - free memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPMCGOMP::deallocate() -{ - PPPMCG::deallocate(); - for (int i=0; i < comm->nthreads; ++i) { - ThrData * thr = fix->get_thr(i); - FFT_SCALAR ** rho1d_thr = static_cast(thr->get_rho1d()); - memory->destroy2d_offset(rho1d_thr,-order/2); + thr->init_pppm(-order,memory); } } /* ---------------------------------------------------------------------- - adjust PPPMCG coeffs, called initially and whenever volume has changed + pre-compute modified (Hockney-Eastwood) Coulomb Green's function ------------------------------------------------------------------------- */ -void PPPMCGOMP::setup() +void PPPMCGOMP::compute_gf_ik() { - int i,j,k,n; - double *prd; - - // volume-dependent factors - // adjust z dimension for 2d slab PPPM - // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 - - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; + const double * const prd = (triclinic==0) ? 
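/* ----------------------------------------------------------------------
   the data_reduce_fft() helper removed above folded the per-thread
   copies of the density brick back into copy 0, each thread reducing a
   disjoint slice after a barrier. the general pattern as a hedged
   standalone sketch; like the original it must be called by every
   thread of an enclosing parallel region
------------------------------------------------------------------------- */

// reduce nthreads copies of an n-element array into copy 0 and zero the
// spares; data holds copy0[0..n-1], copy1[0..n-1], ... back to back
static void reduce_thread_copies(double *data, int n, int nthreads, int tid)
{
#if defined(_OPENMP)
  if (nthreads == 1) return;
#pragma omp barrier              // wait until all copies are fully written
  const int idelta = n/nthreads + 1;
  const int ifrom = tid*idelta;
  const int ito = (ifrom + idelta > n) ? n : ifrom + idelta;
  for (int m = ifrom; m < ito; ++m)
    for (int t = 1; t < nthreads; ++t) {
      data[m] += data[t*n + m];
      data[t*n + m] = 0.0;
    }
#else
  (void) data; (void) n; (void) nthreads; (void) tid;   // single copy, nothing to do
#endif
}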
domain->prd : domain->prd_lamda; const double xprd = prd[0]; const double yprd = prd[1]; const double zprd = prd[2]; const double zprd_slab = zprd*slab_volfactor; - volume = xprd * yprd * zprd_slab; - - delxinv = nx_pppm/xprd; - delyinv = ny_pppm/yprd; - delzinv = nz_pppm/zprd_slab; - - delvolinv = delxinv*delyinv*delzinv; - - const double unitkx = (2.0*MY_PI/xprd); - const double unitky = (2.0*MY_PI/yprd); - const double unitkz = (2.0*MY_PI/zprd_slab); - - // fkx,fky,fkz for my FFT grid pts - - double per; - - for (i = nxlo_fft; i <= nxhi_fft; i++) { - per = i - nx_pppm*(2*i/nx_pppm); - fkx[i] = unitkx*per; - } - - for (i = nylo_fft; i <= nyhi_fft; i++) { - per = i - ny_pppm*(2*i/ny_pppm); - fky[i] = unitky*per; - } - - for (i = nzlo_fft; i <= nzhi_fft; i++) { - per = i - nz_pppm*(2*i/nz_pppm); - fkz[i] = unitkz*per; - } - - // virial coefficients - - double sqk,vterm; - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) { - for (j = nylo_fft; j <= nyhi_fft; j++) { - for (i = nxlo_fft; i <= nxhi_fft; i++) { - sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; - if (sqk == 0.0) { - vg[n][0] = 0.0; - vg[n][1] = 0.0; - vg[n][2] = 0.0; - vg[n][3] = 0.0; - vg[n][4] = 0.0; - vg[n][5] = 0.0; - } else { - vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); - vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; - vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; - vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; - vg[n][3] = vterm*fkx[i]*fky[j]; - vg[n][4] = vterm*fkx[i]*fkz[k]; - vg[n][5] = vterm*fky[j]*fkz[k]; - } - n++; - } - } - } - - // modified (Hockney-Eastwood) Coulomb Green's function + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); const int nbx = static_cast ((g_ewald*xprd/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25)); @@ -220,93 +118,188 @@ void PPPMCGOMP::setup() pow(-log(EPS_HOC),0.25)); const int nbz = static_cast ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25)); + const int numk = nxhi_fft - nxlo_fft + 1; + const int numl = nyhi_fft - nylo_fft + 1; - const double form = 1.0; + const int twoorder = 2*order; #if defined(_OPENMP) #pragma omp parallel default(none) #endif { - int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m; - double snx,sny,snz,sqk; + double snx,sny,snz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double sum1,dot1,dot2; double numerator,denominator; - const double gew2 = -4.0*g_ewald*g_ewald; + double sqk; - const int nnx = nxhi_fft-nxlo_fft+1; - const int nny = nyhi_fft-nylo_fft+1; + int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid; - loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads); + loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads); - for (m = nzlo_fft; m <= nzhi_fft; m++) { + for (n = nfrom; n < nto; ++n) { + m = n / (numl*numk); + l = (n - m*numl*numk) / numk; + k = n - m*numl*numk - l*numk; + m += nzlo_fft; + l += nylo_fft; + k += nxlo_fft; - const double fkzm = fkz[m]; - snz = sin(0.5*fkzm*zprd_slab/nz_pppm); - snz *= snz; + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); - for (l = nylo_fft; l <= nyhi_fft; l++) { - const double fkyl = fky[l]; - sny = sin(0.5*fkyl*yprd/ny_pppm); - sny *= sny; + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); - for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); - /* only compute the part designated to this thread */ - nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft)); - if ((nn < nnfrom) || (nn >=nnto)) continue; + 
sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); - const double fkxk = fkx[k]; - snx = sin(0.5*fkxk*xprd/nx_pppm); - snx *= snx; + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; - sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm; + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); - if (sqk != 0.0) { - numerator = form*MY_4PI/sqk; - denominator = gf_denom(snx,sny,snz); - const double dorder = static_cast(order); - sum1 = 0.0; - for (nx = -nbx; nx <= nbx; nx++) { - qx = fkxk + unitkx*nx_pppm*nx; - sx = exp(qx*qx/gew2); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm; - if (argx != 0.0) wx = pow(sin(argx)/argx,dorder); - wx *=wx; + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); - for (ny = -nby; ny <= nby; ny++) { - qy = fkyl + unitky*ny_pppm*ny; - sy = exp(qy*qy/gew2); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,dorder); - wy *= wy; + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); - for (nz = -nbz; nz <= nbz; nz++) { - qz = fkzm + unitkz*nz_pppm*nz; - sz = exp(qz*qz/gew2); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,dorder); - wz *= wz; - - dot1 = fkxk*qx + fkyl*qy + fkzm*qz; - dot2 = qx*qx+qy*qy+qz*qz; - sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; - } - } - } - greensfn[nn] = numerator*sum1/denominator; - } else greensfn[nn] = 0.0; - } - } + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + greensfn[n] = numerator*sum1/denominator; + } else greensfn[n] = 0.0; } - } + } // end of parallel region } /* ---------------------------------------------------------------------- - run the regular toplevel compute method from plain PPPPMCG + compute optimized Green's function for energy calculation +------------------------------------------------------------------------- */ + +void PPPMCGOMP::compute_gf_ad() +{ + + const double * const prd = (triclinic==0) ? 
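/* ----------------------------------------------------------------------
   compute_gf_ik() above parallelizes the triple loop over the FFT grid
   by flattening it: loop_setup_thr() hands each thread a contiguous
   [nfrom,nto) slice of n = 0..nfft-1, and n is unpacked back into
   (k,l,m). the decomposition in isolation
------------------------------------------------------------------------- */

// invert n = k + numk*(l + numl*m) with 0 <= k < numk and 0 <= l < numl
static void unflatten(int n, int numk, int numl, int &k, int &l, int &m)
{
  m = n / (numl*numk);
  l = (n - m*numl*numk) / numk;
  k = n - m*numl*numk - l*numk;
}

// e.g. numk = 4, numl = 3: n = 23 unpacks to (k,l,m) = (3,2,1),
// and 3 + 4*(2 + 3*1) == 23 recovers the flat index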
domain->prd : domain->prd_lamda;
+
+ const double xprd = prd[0];
+ const double yprd = prd[1];
+ const double zprd = prd[2];
+ const double zprd_slab = zprd*slab_volfactor;
+ const double unitkx = (MY_2PI/xprd);
+ const double unitky = (MY_2PI/yprd);
+ const double unitkz = (MY_2PI/zprd_slab);
+
+ const int numk = nxhi_fft - nxlo_fft + 1;
+ const int numl = nyhi_fft - nylo_fft + 1;
+
+ const int twoorder = 2*order;
+ double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
+#endif
+ {
+ double snx,sny,snz,sqk;
+ double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+ double numerator,denominator;
+ int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
+
+ loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+ for (n = nfrom; n < nto; ++n) {
+
+ m = n / (numl*numk);
+ l = (n - m*numl*numk) / numk;
+ k = n - m*numl*numk - l*numk;
+ m += nzlo_fft;
+ l += nylo_fft;
+ k += nxlo_fft;
+
+ mper = m - nz_pppm*(2*m/nz_pppm);
+ qz = unitkz*mper;
+ snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
+ sz = exp(-0.25*square(qz/g_ewald));
+ argz = 0.5*qz*zprd_slab/nz_pppm;
+ wz = powsinxx(argz,twoorder);
+
+ lper = l - ny_pppm*(2*l/ny_pppm);
+ qy = unitky*lper;
+ sny = square(sin(0.5*qy*yprd/ny_pppm));
+ sy = exp(-0.25*square(qy/g_ewald));
+ argy = 0.5*qy*yprd/ny_pppm;
+ wy = powsinxx(argy,twoorder);
+
+ kper = k - nx_pppm*(2*k/nx_pppm);
+ qx = unitkx*kper;
+ snx = square(sin(0.5*qx*xprd/nx_pppm));
+ sx = exp(-0.25*square(qx/g_ewald));
+ argx = 0.5*qx*xprd/nx_pppm;
+ wx = powsinxx(argx,twoorder);
+
+ sqk = qx*qx + qy*qy + qz*qz;
+
+ if (sqk != 0.0) {
+ numerator = MY_4PI/sqk;
+ denominator = gf_denom(snx,sny,snz);
+ greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
+ sf0 += sf_precoeff1[n]*greensfn[n];
+ sf1 += sf_precoeff2[n]*greensfn[n];
+ sf2 += sf_precoeff3[n]*greensfn[n];
+ sf3 += sf_precoeff4[n]*greensfn[n];
+ sf4 += sf_precoeff5[n]*greensfn[n];
+ sf5 += sf_precoeff6[n]*greensfn[n];
+ } else {
+ greensfn[n] = 0.0;
+ sf0 += sf_precoeff1[n]*greensfn[n];
+ sf1 += sf_precoeff2[n]*greensfn[n];
+ sf2 += sf_precoeff3[n]*greensfn[n];
+ sf3 += sf_precoeff4[n]*greensfn[n];
+ sf4 += sf_precoeff5[n]*greensfn[n];
+ sf5 += sf_precoeff6[n]*greensfn[n];
+ }
+ }
+ } // end of parallel region
+
+ // compute the coefficients for the self-force correction
+
+ double prex, prey, prez, tmp[6];
+ prex = prey = prez = MY_PI/volume;
+ prex *= nx_pppm/xprd;
+ prey *= ny_pppm/yprd;
+ prez *= nz_pppm/zprd_slab;
+ tmp[0] = sf0 * prex;
+ tmp[1] = sf1 * prex*2;
+ tmp[2] = sf2 * prey;
+ tmp[3] = sf3 * prey*2;
+ tmp[4] = sf4 * prez;
+ tmp[5] = sf5 * prez*2;
+
+ // communicate values with other procs
+
+ MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
+}
+
+/* ----------------------------------------------------------------------
- run the regular toplevel compute method from plain PPPPMCG
+ run the regular toplevel compute method from plain PPPMCG
 which will have individual methods replaced by our threaded
 versions and then call the obligatory force reduction. 
------------------------------------------------------------------------- */ @@ -331,94 +324,10 @@ void PPPMCGOMP::compute(int eflag, int vflag) } /* ---------------------------------------------------------------------- - create discretized "density" on section of global grid due to my particles - density(x,y,z) = charge "density" at grid points of my 3d brick - (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) - in global grid + interpolate from grid to get electric field & force on my particles for ik ------------------------------------------------------------------------- */ -void PPPMCGOMP::make_rho() -{ - const double * const q = atom->q; - const double * const * const x = atom->x; - const int nthreads = comm->nthreads; - -#if defined(_OPENMP) -#pragma omp parallel default(none) -#endif - { -#if defined(_OPENMP) - // each thread works on a fixed chunk of atoms. - const int tid = omp_get_thread_num(); - const int inum = num_charged; - const int idelta = 1 + inum/nthreads; - const int ifrom = tid*idelta; - const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; -#else - const int tid = 0; - const int ifrom = 0; - const int ito = num_charged; -#endif - - int i,j,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0; - - // set up clear 3d density array - const int nzoffs = (nzhi_out-nzlo_out+1)*tid; - FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]); - memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR)); - - ThrData *thr = fix->get_thr(tid); - FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); - - // loop over my charges, add their contribution to nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - // this if protects against having more threads than charged local atoms - if (ifrom < num_charged) { - for (j = ifrom; j < ito; j++) { - i = is_charged[j]; - - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; - - compute_rho1d_thr(r1d,dx,dy,dz); - - z0 = delvolinv * q[i]; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - y0 = z0*r1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - x0 = y0*r1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - db[mz][my][mx] += x0*r1d[0][l]; - } - } - } - } - } -#if defined(_OPENMP) - // reduce 3d density array - if (nthreads > 1) { - data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid); - } -#endif - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles -------------------------------------------------------------------------- */ - -void PPPMCGOMP::fieldforce() +void PPPMCGOMP::fieldforce_ik() { // loop over my charges, interpolate electric field from nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -426,79 +335,161 @@ void PPPMCGOMP::fieldforce() // (mx,my,mz) = global coords of moving stencil pt // ek = 3 components of E-field on particle - const double * const q = atom->q; - const double * const * const x = atom->x; - const int nthreads = comm->nthreads; + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const double * _noalias const q = atom->q; const double 
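/* ----------------------------------------------------------------------
   compute_gf_ad() above chains two reductions: an OpenMP
   reduction(+:...) clause sums the sf0..sf5 partials across threads,
   then MPI_Allreduce sums across ranks. the combined pattern in
   miniature (assumes an initialized MPI environment)
------------------------------------------------------------------------- */

#include <mpi.h>

double sum_grid_quantity(const double *val, int nfft, MPI_Comm world)
{
  double local = 0.0;
#if defined(_OPENMP)
#pragma omp parallel for reduction(+:local)   // per-thread partial sums
#endif
  for (int n = 0; n < nfft; ++n) local += val[n];

  double global = 0.0;                        // then combine across ranks
  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, world);
  return global;
}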
qqrd2e = force->qqrd2e; + const int nthreads = comm->nthreads; #if defined(_OPENMP) #pragma omp parallel default(none) #endif { -#if defined(_OPENMP) - // each thread works on a fixed chunk of atoms. - const int tid = omp_get_thread_num(); - const int inum = num_charged; - const int idelta = 1 + inum/nthreads; - const int ifrom = tid*idelta; - const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; -#else - const int ifrom = 0; - const int ito = num_charged; - const int tid = 0; -#endif + FFT_SCALAR dx,dy,dz,x0,y0,z0,ekx,eky,ekz; + int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz; + + loop_setup_thr(ifrom,ito,tid,num_charged,nthreads); + + // get per thread data ThrData *thr = fix->get_thr(tid); - double * const * const f = thr->get_f(); - FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); + dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0]; + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); - int i,j,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0; - FFT_SCALAR ekx,eky,ekz; + for (int j = ifrom; j < ito; ++j) { + i = is_charged[j]; - // this if protects against having more threads than charged local atoms - if (ifrom < num_charged) { - for (j = ifrom; j < ito; j++) { - i = is_charged[j]; + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv; - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + compute_rho1d_thr(r1d,dx,dy,dz); - compute_rho1d_thr(r1d,dx,dy,dz); - - ekx = eky = ekz = ZEROF; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - z0 = r1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - y0 = z0*r1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - x0 = y0*r1d[0][l]; - ekx -= x0*vdx_brick[mz][my][mx]; - eky -= x0*vdy_brick[mz][my][mx]; - ekz -= x0*vdz_brick[mz][my][mx]; - } - } - } - - // convert E-field to force - const double qfactor = qqrd2e*scale*q[i]; - f[i][0] += qfactor*ekx; - f[i][1] += qfactor*eky; - f[i][2] += qfactor*ekz; + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + ekx -= x0*vdx_brick[mz][my][mx]; + eky -= x0*vdy_brick[mz][my][mx]; + ekz -= x0*vdz_brick[mz][my][mx]; + } + } } + + // convert E-field to force + + const double qfactor = qqrd2e * scale * q[i]; + f[i].x += qfactor*ekx; + f[i].y += qfactor*eky; + if (slabflag != 2) f[i].z += qfactor*ekz; } - } + } // end of parallel region } /* ---------------------------------------------------------------------- - interpolate from grid to get per-atom energy/virial - ------------------------------------------------------------------------- */ + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ + +void PPPMCGOMP::fieldforce_ad() +{ + const double *prd = (triclinic == 0) ? 
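/* ----------------------------------------------------------------------
   fieldforce_ik() above recasts atom->x[0] to dbl3_t * (plus _noalias)
   so the compiler sees a flat array of xyz triples with no aliasing.
   the cast is only sound because LAMMPS stores x as one contiguous
   nlocal x 3 block; a sketch of the idea with a local struct
------------------------------------------------------------------------- */

struct dbl3 { double x, y, z; };

// xflat points at nlocal*3 contiguous doubles laid out x0,y0,z0,x1,y1,z1,...
static double sum_z(const double *xflat, int nlocal)
{
  const dbl3 *xs = (const dbl3 *) xflat;   // reinterpret row-major 2d storage
  double s = 0.0;
  for (int i = 0; i < nlocal; ++i) s += xs[i].z;
  return s;
}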
domain->prd : domain->prd_lamda;
+ const double hx_inv = nx_pppm/prd[0];
+ const double hy_inv = ny_pppm/prd[1];
+ const double hz_inv = nz_pppm/prd[2];
+
+ // loop over my charges, interpolate electric field from nearby grid points
+ // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+ // (dx,dy,dz) = distance to "lower left" grid pt
+ // (mx,my,mz) = global coords of moving stencil pt
+ // ek = 3 components of E-field on particle
+
+ const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+ const double * _noalias const q = atom->q;
+ const double qqrd2e = force->qqrd2e;
+ const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+ {
+ int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;
+ FFT_SCALAR dx,dy,dz,ekx,eky,ekz;
+ double s1,s2,s3,sf;
+
+ loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);
+
+ // get per thread data
+ ThrData *thr = fix->get_thr(tid);
+ dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+ FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+ FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
+
+ for (int j = ifrom; j < ito; ++j) {
+ i = is_charged[j];
+
+ nx = part2grid[i][0];
+ ny = part2grid[i][1];
+ nz = part2grid[i][2];
+ dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
+ dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
+ dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;
+
+ compute_rho1d_thr(r1d,dx,dy,dz);
+ compute_drho1d_thr(d1d,dx,dy,dz);
+
+ ekx = eky = ekz = ZEROF;
+ for (n = nlower; n <= nupper; n++) {
+ mz = n+nz;
+ for (m = nlower; m <= nupper; m++) {
+ my = m+ny;
+ for (l = nlower; l <= nupper; l++) {
+ mx = l+nx;
+ ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+ eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+ ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
+ }
+ }
+ }
+ ekx *= hx_inv;
+ eky *= hy_inv;
+ ekz *= hz_inv;
+
+ // convert E-field to force and subtract self forces
+
+ const double qi = q[i];
+ const double qfactor = qqrd2e * scale * qi;
+
+ s1 = x[i].x*hx_inv;
+ sf = sf_coeff[0]*sin(MY_2PI*s1);
+ sf += sf_coeff[1]*sin(MY_4PI*s1);
+ sf *= 2.0*qi;
+ f[i].x += qfactor*(ekx - sf);
+
+ s2 = x[i].y*hy_inv;
+ sf = sf_coeff[2]*sin(MY_2PI*s2);
+ sf += sf_coeff[3]*sin(MY_4PI*s2);
+ sf *= 2*qi;
+ f[i].y += qfactor*(eky - sf);
+
+ s3 = x[i].z*hz_inv;
+ sf = sf_coeff[4]*sin(MY_2PI*s3);
+ sf += sf_coeff[5]*sin(MY_4PI*s3);
+ sf *= 2*qi;
+ if (slabflag != 2) f[i].z += qfactor*(ekz - sf);
+ }
+ } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */

 void PPPMCGOMP::fieldforce_peratom()
 {
@@ -507,82 +498,71 @@
 // (dx,dy,dz) = distance to "lower left" grid pt
 // (mx,my,mz) = global coords of moving stencil pt

- const double * const q = atom->q;
- const double * const * const x = atom->x;
+ const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+ const double * _noalias const q = atom->q;
 const int nthreads = comm->nthreads;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
 {
-#if defined(_OPENMP)
- // each thread works on a fixed chunk of atoms.
- const int tid = omp_get_thread_num();
- const int inum = num_charged;
- const int idelta = 1 + inum/nthreads;
- const int ifrom = tid*idelta;
- const int ito = ((ifrom + idelta) > 
inum : ifrom + idelta; -#else - const int ifrom = 0; - const int ito = num_charged; - const int tid = 0; -#endif - ThrData *thr = fix->get_thr(tid); - FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); - - int i,j,l,m,n,nx,ny,nz,mx,my,mz; FFT_SCALAR dx,dy,dz,x0,y0,z0; FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz; - // this if protects against having more threads than charged local atoms - if (ifrom < num_charged) { - for (j = ifrom; j < ito; j++) { - i = is_charged[j]; + loop_setup_thr(ifrom,ito,tid,num_charged,nthreads); - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + // get per thread data + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); - compute_rho1d_thr(r1d,dx,dy,dz); + for (int j=ifrom; j < ito; ++j) { + i = is_charged[j]; - u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - z0 = r1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - y0 = z0*r1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - x0 = y0*r1d[0][l]; - if (eflag_atom) u += x0*u_brick[mz][my][mx]; - if (vflag_atom) { - v0 += x0*v0_brick[mz][my][mx]; - v1 += x0*v1_brick[mz][my][mx]; - v2 += x0*v2_brick[mz][my][mx]; - v3 += x0*v3_brick[mz][my][mx]; - v4 += x0*v4_brick[mz][my][mx]; - v5 += x0*v5_brick[mz][my][mx]; - } - } - } - } + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv; - if (eflag_atom) eatom[i] += q[i]*u; - if (vflag_atom) { - vatom[i][0] += v0; - vatom[i][1] += v1; - vatom[i][2] += v2; - vatom[i][3] += v3; - vatom[i][4] += v4; - vatom[i][5] += v5; - } + compute_rho1d_thr(r1d,dx,dy,dz); + + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + if (eflag_atom) u += x0*u_brick[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick[mz][my][mx]; + v1 += x0*v1_brick[mz][my][mx]; + v2 += x0*v2_brick[mz][my][mx]; + v3 += x0*v3_brick[mz][my][mx]; + v4 += x0*v4_brick[mz][my][mx]; + v5 += x0*v5_brick[mz][my][mx]; + } + } + } + } + + const double qi = q[i]; + if (eflag_atom) eatom[i] += qi*u; + if (vflag_atom) { + vatom[i][0] += qi*v0; + vatom[i][1] += qi*v1; + vatom[i][2] += qi*v2; + vatom[i][3] += qi*v3; + vatom[i][4] += qi*v4; + vatom[i][5] += qi*v5; } } - } + } // end of parallel region } /* ---------------------------------------------------------------------- @@ -590,7 +570,7 @@ void PPPMCGOMP::fieldforce_peratom() dx,dy,dz = distance of particle from "lower left" grid point ------------------------------------------------------------------------- */ void PPPMCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx, - const FFT_SCALAR &dy, const FFT_SCALAR &dz) + const FFT_SCALAR &dy, const FFT_SCALAR &dz) { int k,l; FFT_SCALAR r1,r2,r3; @@ -608,3 +588,28 @@ void PPPMCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCAL r1d[2][k] = r3; } } + +/* ---------------------------------------------------------------------- + charge assignment into drho1d + dx,dy,dz = distance of particle from 
"lower left" grid point +------------------------------------------------------------------------- */ + +void PPPMCGOMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx, + const FFT_SCALAR &dy, const FFT_SCALAR &dz) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-2; l >= 0; l--) { + r1 = drho_coeff[l][k] + r1*dx; + r2 = drho_coeff[l][k] + r2*dy; + r3 = drho_coeff[l][k] + r3*dz; + } + d1d[0][k] = r1; + d1d[1][k] = r2; + d1d[2][k] = r3; + } +} diff --git a/src/USER-OMP/pppm_cg_omp.h b/src/USER-OMP/pppm_cg_omp.h index 10a9d35f29..37be55eeef 100644 --- a/src/USER-OMP/pppm_cg_omp.h +++ b/src/USER-OMP/pppm_cg_omp.h @@ -25,23 +25,29 @@ KSpaceStyle(pppm/cg/omp,PPPMCGOMP) namespace LAMMPS_NS { - class PPPMCGOMP : public PPPMCG, public ThrOMP { +class PPPMCGOMP : public PPPMCG, public ThrOMP { public: PPPMCGOMP(class LAMMPS *, int, char **); - virtual void compute(int, int); - virtual void setup(); virtual ~PPPMCGOMP () {}; + virtual void compute(int, int); protected: virtual void allocate(); virtual void deallocate(); - virtual void fieldforce(); - virtual void fieldforce_peratom(); - virtual void make_rho(); + virtual void compute_gf_ik(); + virtual void compute_gf_ad(); + + virtual void fieldforce_ik(); + virtual void fieldforce_ad(); + virtual void fieldforce_peratom(); +// virtual void make_rho(); + + private: void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, const FFT_SCALAR &, const FFT_SCALAR &); -// void compute_rho_coeff(); + void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, + const FFT_SCALAR &, const FFT_SCALAR &); // void slabcorr(int); }; diff --git a/src/USER-OMP/pppm_omp.cpp b/src/USER-OMP/pppm_omp.cpp index 5c400b6f48..d45cc492ed 100644 --- a/src/USER-OMP/pppm_omp.cpp +++ b/src/USER-OMP/pppm_omp.cpp @@ -19,9 +19,12 @@ #include "atom.h" #include "comm.h" #include "domain.h" +#include "error.h" +#include "fix_omp.h" #include "force.h" #include "memory.h" #include "math_const.h" +#include "math_special.h" #include #include @@ -29,13 +32,12 @@ #include "suffix.h" using namespace LAMMPS_NS; using namespace MathConst; +using namespace MathSpecial; #ifdef FFT_SINGLE #define ZEROF 0.0f -#define ONEF 1.0f #else #define ZEROF 0.0 -#define ONEF 1.0 #endif #define EPS_HOC 1.0e-7 @@ -56,7 +58,27 @@ void PPPMOMP::allocate() { PPPM::allocate(); - const int nthreads = comm->nthreads; +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + thr->init_pppm(order,memory); + } +} + +/* ---------------------------------------------------------------------- + free memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPMOMP::deallocate() +{ + PPPM::deallocate(); #if defined(_OPENMP) #pragma omp parallel default(none) @@ -67,153 +89,26 @@ void PPPMOMP::allocate() #else const int tid = 0; #endif - - FFT_SCALAR **rho1d_thr; - memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr"); ThrData *thr = fix->get_thr(tid); - thr->init_pppm(static_cast(rho1d_thr)); - } - - const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out -1; - - // reallocate density brick, so it fits our needs - memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); - memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out, - 
nxlo_out,nxhi_out,"pppm:density_brick"); -} - -// NOTE: special version of reduce_data for FFT_SCALAR data type. -// reduce per thread data into the first part of the data -// array that is used for the non-threaded parts and reset -// the temporary storage to 0.0. this routine depends on -// multi-dimensional arrays like force stored in this order -// x1,y1,z1,x2,y2,z2,... -// we need to post a barrier to wait until all threads are done -// writing to the array. -static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid) -{ -#if defined(_OPENMP) - // NOOP in non-threaded execution. - if (nthreads == 1) return; -#pragma omp barrier - { - const int nvals = ndim*nall; - const int idelta = nvals/nthreads + 1; - const int ifrom = tid*idelta; - const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta); - - // this if protects against having more threads than atoms - if (ifrom < nall) { - for (int m = ifrom; m < ito; ++m) { - for (int n = 1; n < nthreads; ++n) { - dall[m] += dall[n*nvals + m]; - dall[n*nvals + m] = 0.0; - } - } - } - } -#else - // NOOP in non-threaded execution. - return; -#endif -} - -/* ---------------------------------------------------------------------- - free memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPMOMP::deallocate() -{ - PPPM::deallocate(); - for (int i=0; i < comm->nthreads; ++i) { - ThrData * thr = fix->get_thr(i); - FFT_SCALAR ** rho1d_thr = static_cast(thr->get_rho1d()); - memory->destroy2d_offset(rho1d_thr,-order/2); + thr->init_pppm(-order,memory); } } /* ---------------------------------------------------------------------- - adjust PPPM coeffs, called initially and whenever volume has changed + pre-compute modified (Hockney-Eastwood) Coulomb Green's function ------------------------------------------------------------------------- */ -void PPPMOMP::setup() +void PPPMOMP::compute_gf_ik() { - int i,j,k,n; - double *prd; - - // volume-dependent factors - // adjust z dimension for 2d slab PPPM - // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 - - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; + const double * const prd = (triclinic==0) ? 
domain->prd : domain->prd_lamda; const double xprd = prd[0]; const double yprd = prd[1]; const double zprd = prd[2]; const double zprd_slab = zprd*slab_volfactor; - volume = xprd * yprd * zprd_slab; - - delxinv = nx_pppm/xprd; - delyinv = ny_pppm/yprd; - delzinv = nz_pppm/zprd_slab; - - delvolinv = delxinv*delyinv*delzinv; - - const double unitkx = (2.0*MY_PI/xprd); - const double unitky = (2.0*MY_PI/yprd); - const double unitkz = (2.0*MY_PI/zprd_slab); - - // fkx,fky,fkz for my FFT grid pts - - double per; - - for (i = nxlo_fft; i <= nxhi_fft; i++) { - per = i - nx_pppm*(2*i/nx_pppm); - fkx[i] = unitkx*per; - } - - for (i = nylo_fft; i <= nyhi_fft; i++) { - per = i - ny_pppm*(2*i/ny_pppm); - fky[i] = unitky*per; - } - - for (i = nzlo_fft; i <= nzhi_fft; i++) { - per = i - nz_pppm*(2*i/nz_pppm); - fkz[i] = unitkz*per; - } - - // virial coefficients - - double sqk,vterm; - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) { - for (j = nylo_fft; j <= nyhi_fft; j++) { - for (i = nxlo_fft; i <= nxhi_fft; i++) { - sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; - if (sqk == 0.0) { - vg[n][0] = 0.0; - vg[n][1] = 0.0; - vg[n][2] = 0.0; - vg[n][3] = 0.0; - vg[n][4] = 0.0; - vg[n][5] = 0.0; - } else { - vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); - vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; - vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; - vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; - vg[n][3] = vterm*fkx[i]*fky[j]; - vg[n][4] = vterm*fkx[i]*fkz[k]; - vg[n][5] = vterm*fky[j]*fkz[k]; - } - n++; - } - } - } - - // modified (Hockney-Eastwood) Coulomb Green's function + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); const int nbx = static_cast ((g_ewald*xprd/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25)); @@ -221,93 +116,188 @@ void PPPMOMP::setup() pow(-log(EPS_HOC),0.25)); const int nbz = static_cast ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25)); + const int numk = nxhi_fft - nxlo_fft + 1; + const int numl = nyhi_fft - nylo_fft + 1; - const double form = 1.0; + const int twoorder = 2*order; #if defined(_OPENMP) #pragma omp parallel default(none) #endif { - int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m; - double snx,sny,snz,sqk; + double snx,sny,snz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double sum1,dot1,dot2; double numerator,denominator; - const double gew2 = -4.0*g_ewald*g_ewald; + double sqk; - const int nnx = nxhi_fft-nxlo_fft+1; - const int nny = nyhi_fft-nylo_fft+1; + int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid; - loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads); + loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads); - for (m = nzlo_fft; m <= nzhi_fft; m++) { + for (n = nfrom; n < nto; ++n) { + m = n / (numl*numk); + l = (n - m*numl*numk) / numk; + k = n - m*numl*numk - l*numk; + m += nzlo_fft; + l += nylo_fft; + k += nxlo_fft; - const double fkzm = fkz[m]; - snz = sin(0.5*fkzm*zprd_slab/nz_pppm); - snz *= snz; + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); - for (l = nylo_fft; l <= nyhi_fft; l++) { - const double fkyl = fky[l]; - sny = sin(0.5*fkyl*yprd/ny_pppm); - sny *= sny; + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); - for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); - /* only compute the part designated to this thread */ - nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft)); - if ((nn < nnfrom) || (nn >=nnto)) continue; + sqk 
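/* ----------------------------------------------------------------------
   the Green's function hunks above call MathSpecial::powsinxx(x,n) for
   the (sin(x)/x)^n interpolation weight. assuming those semantics (the
   x -> 0 limit of sin(x)/x is 1), an equivalent unoptimized sketch
------------------------------------------------------------------------- */

#include <cmath>

// (sin(x)/x)^n with the removable singularity at x = 0 handled explicitly
static double powsinxx_sketch(double x, int n)
{
  if (x == 0.0) return 1.0;
  return std::pow(std::sin(x)/x, (double) n);
}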
= square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); - const double fkxk = fkx[k]; - snx = sin(0.5*fkxk*xprd/nx_pppm); - snx *= snx; + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; - sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm; + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); - if (sqk != 0.0) { - numerator = form*MY_4PI/sqk; - denominator = gf_denom(snx,sny,snz); - sum1 = 0.0; - const double dorder = static_cast(order); - for (nx = -nbx; nx <= nbx; nx++) { - qx = fkxk + unitkx*nx_pppm*nx; - sx = exp(qx*qx/gew2); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm; - if (argx != 0.0) wx = pow(sin(argx)/argx,dorder); - wx *=wx; + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); - for (ny = -nby; ny <= nby; ny++) { - qy = fkyl + unitky*ny_pppm*ny; - sy = exp(qy*qy/gew2); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,dorder); - wy *= wy; + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); - for (nz = -nbz; nz <= nbz; nz++) { - qz = fkzm + unitkz*nz_pppm*nz; - sz = exp(qz*qz/gew2); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,dorder); - wz *= wz; - - dot1 = fkxk*qx + fkyl*qy + fkzm*qz; - dot2 = qx*qx+qy*qy+qz*qz; - sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; - } - } + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; } - greensfn[nn] = numerator*sum1/denominator; - } else greensfn[nn] = 0.0; + } } - } + greensfn[n] = numerator*sum1/denominator; + } else greensfn[n] = 0.0; } - } + } // end of parallel region } /* ---------------------------------------------------------------------- - run the regular toplevel compute method from plain PPPPM + compute optimized Green's function for energy calculation +------------------------------------------------------------------------- */ + +void PPPMOMP::compute_gf_ad() +{ + + const double * const prd = (triclinic==0) ? 
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  const int numk = nxhi_fft - nxlo_fft + 1;
+  const int numl = nyhi_fft - nylo_fft + 1;
+
+  const int twoorder = 2*order;
+  double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
+#endif
+  {
+    double snx,sny,snz,sqk;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double numerator,denominator;
+    int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
+
+    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+    for (n = nfrom; n < nto; ++n) {
+
+      m = n / (numl*numk);
+      l = (n - m*numl*numk) / numk;
+      k = n - m*numl*numk - l*numk;
+      m += nzlo_fft;
+      l += nylo_fft;
+      k += nxlo_fft;
+
+      mper = m - nz_pppm*(2*m/nz_pppm);
+      qz = unitkz*mper;
+      snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
+      sz = exp(-0.25*square(qz/g_ewald));
+      argz = 0.5*qz*zprd_slab/nz_pppm;
+      wz = powsinxx(argz,twoorder);
+
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      qy = unitky*lper;
+      sny = square(sin(0.5*qy*yprd/ny_pppm));
+      sy = exp(-0.25*square(qy/g_ewald));
+      argy = 0.5*qy*yprd/ny_pppm;
+      wy = powsinxx(argy,twoorder);
+
+      kper = k - nx_pppm*(2*k/nx_pppm);
+      qx = unitkx*kper;
+      snx = square(sin(0.5*qx*xprd/nx_pppm));
+      sx = exp(-0.25*square(qx/g_ewald));
+      argx = 0.5*qx*xprd/nx_pppm;
+      wx = powsinxx(argx,twoorder);
+
+      sqk = qx*qx + qy*qy + qz*qz;
+
+      if (sqk != 0.0) {
+        numerator = MY_4PI/sqk;
+        denominator = gf_denom(snx,sny,snz);
+        greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      } else {
+        greensfn[n] = 0.0;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      }
+    }
+  } // end of parallel region
+
+  // compute the coefficients for the self-force correction
+
+  double prex, prey, prez, tmp[6];
+  prex = prey = prez = MY_PI/volume;
+  prex *= nx_pppm/xprd;
+  prey *= ny_pppm/yprd;
+  prez *= nz_pppm/zprd_slab;
+  tmp[0] = sf0 * prex;
+  tmp[1] = sf1 * prex*2;
+  tmp[2] = sf2 * prey;
+  tmp[3] = sf3 * prey*2;
+  tmp[4] = sf4 * prez;
+  tmp[5] = sf5 * prez*2;
+
+  // communicate values with other procs
+
+  MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
+}
+
+/* ----------------------------------------------------------------------
+   run the regular toplevel compute method from plain PPPM
    which will have individual methods replaced by our threaded
    versions and then call the obligatory force reduction.
------------------------------------------------------------------------- */
@@ -332,94 +322,10 @@ void PPPMOMP::compute(int eflag, int vflag)
 }

 /* ----------------------------------------------------------------------
-   create discretized "density" on section of global grid due to my particles
-   density(x,y,z) = charge "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid
+   interpolate from grid to get electric field & force on my particles for ik
 ------------------------------------------------------------------------- */

-void PPPMOMP::make_rho()
-{
-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
-  const int nthreads = comm->nthreads;
-  const int nlocal = atom->nlocal;
-
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int tid = 0;
-    const int ifrom = 0;
-    const int ito = nlocal;
-#endif
-
-    int l,m,n,nx,ny,nz,mx,my,mz;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-
-    // set up clear 3d density array
-    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
-    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
-    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
-
-    ThrData *thr = fix->get_thr(tid);
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
-
-    // loop over my charges, add their contribution to nearby grid points
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // (dx,dy,dz) = distance to "lower left" grid pt
-    // (mx,my,mz) = global coords of moving stencil pt
-
-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
-
-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        z0 = delvolinv * q[i];
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          y0 = z0*r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            x0 = y0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              db[mz][my][mx] += x0*r1d[0][l];
-            }
-          }
-        }
-      }
-    }
-#if defined(_OPENMP)
-    // reduce 3d density array
-    if (nthreads > 1) {
-      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
-    }
-#endif
-  }
-}
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-------------------------------------------------------------------------- */
-
-void PPPMOMP::fieldforce()
+void PPPMOMP::fieldforce_ik()
 {
   // loop over my charges, interpolate electric field from nearby grid points
@@ -427,79 +333,170 @@ void PPPMOMP::fieldforce()
   // (mx,my,mz) = global coords of moving stencil pt
   // ek = 3 components of E-field on particle

-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+
+  const double qqrd2e = force->qqrd2e;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  const int nthreads = comm->nthreads;
   const int nlocal = atom->nlocal;
-  const double qqrd2e = force->qqrd2e;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int ifrom = 0;
-    const int ito = nlocal;
-    const int tid = 0;
-#endif
+    FFT_SCALAR x0,y0,z0,ekx,eky,ekz;
+    int i,ifrom,ito,tid,l,m,n,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
+
+    // get per thread data
     ThrData *thr = fix->get_thr(tid);
-    double * const * const f = thr->get_f();
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

-    int l,m,n,nx,ny,nz,mx,my,mz;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-    FFT_SCALAR ekx,eky,ekz;
+    for (i = ifrom; i < ito; ++i) {
+      const int nx = p2g[i].a;
+      const int ny = p2g[i].b;
+      const int nz = p2g[i].t;
+      const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv;
+      const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv;
+      const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv;

-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
+      compute_rho1d_thr(r1d,dx,dy,dz);

-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        ekx = eky = ekz = ZEROF;
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          z0 = r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            y0 = z0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              x0 = y0*r1d[0][l];
-              ekx -= x0*vdx_brick[mz][my][mx];
-              eky -= x0*vdy_brick[mz][my][mx];
-              ekz -= x0*vdz_brick[mz][my][mx];
-            }
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        z0 = r1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          y0 = z0*r1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            x0 = y0*r1d[0][l];
+            ekx -= x0*vdx_brick[mz][my][mx];
+            eky -= x0*vdy_brick[mz][my][mx];
+            ekz -= x0*vdz_brick[mz][my][mx];
           }
         }
-
-        // convert E-field to force
-        const double qfactor = qqrd2e*scale*q[i];
-        f[i][0] += qfactor*ekx;
-        f[i][1] += qfactor*eky;
-        f[i][2] += qfactor*ekz;
       }
+
+      // convert E-field to force
+
+      const double qfactor = qqrd2e * scale * q[i];
+      f[i].x += qfactor*ekx;
+      f[i].y += qfactor*eky;
+      if (slabflag != 2) f[i].z += qfactor*ekz;
     }
-  }
+  } // end of parallel region
 }

 /* ----------------------------------------------------------------------
-   interpolate from grid to get per-atom energy/virial
- ------------------------------------------------------------------------- */
+   interpolate from grid to get electric field & force on my particles for ad
+------------------------------------------------------------------------- */
+
+void PPPMOMP::fieldforce_ad()
+{
+  const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
+  const double hx_inv = nx_pppm/prd[0];
+  const double hy_inv = ny_pppm/prd[1];
+  const double hz_inv = nz_pppm/prd[2];
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const double qqrd2e = force->qqrd2e;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    double s1,s2,s3,sf;
+    FFT_SCALAR ekx,eky,ekz;
+    int i,ifrom,ito,tid,l,m,n,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
+
+    // get per thread data
+    ThrData *thr = fix->get_thr(tid);
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
+
+    for (i = ifrom; i < ito; ++i) {
+      const int nx = p2g[i].a;
+      const int ny = p2g[i].b;
+      const int nz = p2g[i].t;
+      const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv;
+      const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv;
+      const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv;
+
+      compute_rho1d_thr(r1d,dx,dy,dz);
+      compute_drho1d_thr(d1d,dx,dy,dz);
+
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
+          }
+        }
+      }
+      ekx *= hx_inv;
+      eky *= hy_inv;
+      ekz *= hz_inv;
+
+      // convert E-field to force and subtract self forces
+
+      const double qi = q[i];
+      const double qfactor = qqrd2e * scale * qi;
+
+      s1 = x[i].x*hx_inv;
+      sf = sf_coeff[0]*sin(MY_2PI*s1);
+      sf += sf_coeff[1]*sin(MY_4PI*s1);
+      sf *= 2.0*qi;
+      f[i].x += qfactor*(ekx - sf);
+
+      s2 = x[i].y*hy_inv;
+      sf = sf_coeff[2]*sin(MY_2PI*s2);
+      sf += sf_coeff[3]*sin(MY_4PI*s2);
+      sf *= 2.0*qi;
+      f[i].y += qfactor*(eky - sf);
+
+      s3 = x[i].z*hz_inv;
+      sf = sf_coeff[4]*sin(MY_2PI*s3);
+      sf += sf_coeff[5]*sin(MY_4PI*s3);
+      sf *= 2.0*qi;
+      if (slabflag != 2) f[i].z += qfactor*(ekz - sf);
+    }
+  } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */

 void PPPMOMP::fieldforce_peratom()
 {
@@ -508,8 +505,8 @@ void PPPMOMP::fieldforce_peratom()
   // (dx,dy,dz) = distance to "lower left" grid pt
   // (mx,my,mz) = global coords of moving stencil pt

-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;

   const int nthreads = comm->nthreads;
   const int nlocal = atom->nlocal;
@@ -517,73 +514,61 @@ void PPPMOMP::fieldforce_peratom()
 #pragma omp parallel default(none)
 #endif
   {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int ifrom = 0;
-    const int ito = nlocal;
-    const int tid = 0;
-#endif
-    ThrData *thr = fix->get_thr(tid);
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
-
-    int i,l,m,n,nx,ny,nz,mx,my,mz;
     FFT_SCALAR dx,dy,dz,x0,y0,z0;
     FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+    int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;

-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);

-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+    // get per thread data
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

-        compute_rho1d_thr(r1d,dx,dy,dz);
+    for (i = ifrom; i < ito; ++i) {
+      nx = part2grid[i][0];
+      ny = part2grid[i][1];
+      nz = part2grid[i][2];
+      dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
+      dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
+      dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;

-        u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          z0 = r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            y0 = z0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              x0 = y0*r1d[0][l];
-              if (eflag_atom) u += x0*u_brick[mz][my][mx];
-              if (vflag_atom) {
-                v0 += x0*v0_brick[mz][my][mx];
-                v1 += x0*v1_brick[mz][my][mx];
-                v2 += x0*v2_brick[mz][my][mx];
-                v3 += x0*v3_brick[mz][my][mx];
-                v4 += x0*v4_brick[mz][my][mx];
-                v5 += x0*v5_brick[mz][my][mx];
-              }
+      compute_rho1d_thr(r1d,dx,dy,dz);
+
+      u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        z0 = r1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          y0 = z0*r1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            x0 = y0*r1d[0][l];
+            if (eflag_atom) u += x0*u_brick[mz][my][mx];
+            if (vflag_atom) {
+              v0 += x0*v0_brick[mz][my][mx];
+              v1 += x0*v1_brick[mz][my][mx];
+              v2 += x0*v2_brick[mz][my][mx];
+              v3 += x0*v3_brick[mz][my][mx];
+              v4 += x0*v4_brick[mz][my][mx];
+              v5 += x0*v5_brick[mz][my][mx];
            }
          }
        }
+      }

-        if (eflag_atom) eatom[i] += q[i]*u;
-        if (vflag_atom) {
-          vatom[i][0] += v0;
-          vatom[i][1] += v1;
-          vatom[i][2] += v2;
-          vatom[i][3] += v3;
-          vatom[i][4] += v4;
-          vatom[i][5] += v5;
-        }
+      const double qi = q[i];
+      if (eflag_atom) eatom[i] += qi*u;
+      if (vflag_atom) {
+        vatom[i][0] += qi*v0;
+        vatom[i][1] += qi*v1;
+        vatom[i][2] += qi*v2;
+        vatom[i][3] += qi*v3;
+        vatom[i][4] += qi*v4;
+        vatom[i][5] += qi*v5;
      }
    }
-  }
+  } // end of parallel region
 }

 /* ----------------------------------------------------------------------
@@ -609,3 +594,28 @@ void PPPMOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR
     r1d[2][k] = r3;
   }
 }
+
+/* ----------------------------------------------------------------------
+   charge assignment into drho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+
+void PPPMOMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
+                                 const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-2; l >= 0; l--) {
+      r1 = drho_coeff[l][k] + r1*dx;
+      r2 = drho_coeff[l][k] + r2*dy;
+      r3 = drho_coeff[l][k] + r3*dz;
+    }
+    d1d[0][k] = r1;
+    d1d[1][k] = r2;
+    d1d[2][k] = r3;
+  }
+}
diff --git a/src/USER-OMP/pppm_omp.h b/src/USER-OMP/pppm_omp.h
index b3c069844c..835398b79f 100644
--- a/src/USER-OMP/pppm_omp.h
+++ b/src/USER-OMP/pppm_omp.h
@@ -25,23 +25,29 @@ KSpaceStyle(pppm/omp,PPPMOMP)

 namespace LAMMPS_NS {

-  class PPPMOMP : public PPPM, public ThrOMP {
+class PPPMOMP : public PPPM, public ThrOMP {

  public:
   PPPMOMP(class LAMMPS *, int, char **);
   virtual ~PPPMOMP () {};
-  virtual void setup();
   virtual void compute(int, int);

  protected:
   virtual void allocate();
   virtual void deallocate();
-  virtual void fieldforce();
-  virtual void fieldforce_peratom();
-  virtual void make_rho();
+
+  virtual void compute_gf_ik();
+  virtual void compute_gf_ad();
+
+  virtual void fieldforce_ik();
+  virtual void fieldforce_ad();
+  virtual void fieldforce_peratom();
+//  virtual void make_rho();
+
+ private:
   void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
                          const FFT_SCALAR &, const FFT_SCALAR &);
-//  void compute_rho_coeff();
+  void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                          const FFT_SCALAR &, const FFT_SCALAR &);
//  void slabcorr(int);

 };
diff --git a/src/USER-OMP/pppm_tip4p_omp.cpp b/src/USER-OMP/pppm_tip4p_omp.cpp
index e7c6cbe106..92c8098f1c 100644
--- a/src/USER-OMP/pppm_tip4p_omp.cpp
+++ b/src/USER-OMP/pppm_tip4p_omp.cpp
@@ -19,9 +19,12 @@
 #include "atom.h"
 #include "comm.h"
 #include "domain.h"
+#include "error.h"
+#include "fix_omp.h"
 #include "force.h"
 #include "memory.h"
 #include "math_const.h"
+#include "math_special.h"

 #include <string.h>
 #include <math.h>
@@ -29,74 +32,294 @@
 #include "suffix.h"

 using namespace LAMMPS_NS;
 using namespace MathConst;
-
-#define OFFSET 16384
+using namespace MathSpecial;

 #ifdef FFT_SINGLE
 #define ZEROF 0.0f
-#define ONEF  1.0f
 #else
 #define ZEROF 0.0
-#define ONEF  1.0
 #endif

+#define EPS_HOC 1.0e-7
+#define OFFSET 16384
+
 /* ---------------------------------------------------------------------- */

 PPPMTIP4POMP::PPPMTIP4POMP(LAMMPS *lmp, int narg, char **arg) :
-  PPPMOMP(lmp, narg, arg)
+  PPPMTIP4P(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
 {
-  tip4pflag = 1;
   suffix_flag |= Suffix::OMP;
 }

-/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+   allocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */

-// NOTE: special version of reduce_data for FFT_SCALAR data type.
-//       reduce per thread data into the first part of the data
-//       array that is used for the non-threaded parts and reset
-//       the temporary storage to 0.0. this routine depends on
-//       multi-dimensional arrays like force stored in this order
-//       x1,y1,z1,x2,y2,z2,...
-//       we need to post a barrier to wait until all threads are done
-//       writing to the array.
-static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
+void PPPMTIP4POMP::allocate()
 {
-#if defined(_OPENMP)
-  // NOOP in non-threaded execution.
-  if (nthreads == 1) return;
-#pragma omp barrier
-  {
-    const int nvals = ndim*nall;
-    const int idelta = nvals/nthreads + 1;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
+  PPPMTIP4P::allocate();

-    // this if protects against having more threads than atoms
-    if (ifrom < nall) {
-      for (int m = ifrom; m < ito; ++m) {
-        for (int n = 1; n < nthreads; ++n) {
-          dall[m] += dall[n*nvals + m];
-          dall[n*nvals + m] = 0.0;
-        }
-      }
-    }
-  }
-#else
-  // NOOP in non-threaded execution.
-  return;
-#endif
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    thr->init_pppm(order,memory);
+  }
 }

-/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+   free memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */

-void PPPMTIP4POMP::init()
+void PPPMTIP4POMP::deallocate()
 {
-  // TIP4P PPPM requires newton on, b/c it computes forces on ghost atoms
+  PPPMTIP4P::deallocate();

-  if (force->newton == 0)
-    error->all(FLERR,"Kspace style pppm/tip4p/omp requires newton on");
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    thr->init_pppm(-order,memory);
+  }
+}

-  PPPMOMP::init();
+/* ----------------------------------------------------------------------
+   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::compute_gf_ik()
+{
+  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int numk = nxhi_fft - nxlo_fft + 1;
+  const int numl = nyhi_fft - nylo_fft + 1;
+
+  const int twoorder = 2*order;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    double snx,sny,snz;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double sum1,dot1,dot2;
+    double numerator,denominator;
+    double sqk;
+
+    int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid;
+
+    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+    for (n = nfrom; n < nto; ++n) {
+      m = n / (numl*numk);
+      l = (n - m*numl*numk) / numk;
+      k = n - m*numl*numk - l*numk;
+      m += nzlo_fft;
+      l += nylo_fft;
+      k += nxlo_fft;
+
+      mper = m - nz_pppm*(2*m/nz_pppm);
+      snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
+
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
+
+      kper = k - nx_pppm*(2*k/nx_pppm);
+      snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
+
+      sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
+
+      if (sqk != 0.0) {
+        numerator = 12.5663706/sqk;
+        denominator = gf_denom(snx,sny,snz);
+        sum1 = 0.0;
+
+        for (nx = -nbx; nx <= nbx; nx++) {
+          qx = unitkx*(kper+nx_pppm*nx);
+          sx = exp(-0.25*square(qx/g_ewald));
+          argx = 0.5*qx*xprd/nx_pppm;
+          wx = powsinxx(argx,twoorder);
+
+          for (ny = -nby; ny <= nby; ny++) {
+            qy = unitky*(lper+ny_pppm*ny);
+            sy = exp(-0.25*square(qy/g_ewald));
+            argy = 0.5*qy*yprd/ny_pppm;
+            wy = powsinxx(argy,twoorder);
+
+            for (nz = -nbz; nz <= nbz; nz++) {
+              qz = unitkz*(mper+nz_pppm*nz);
+              sz = exp(-0.25*square(qz/g_ewald));
+              argz = 0.5*qz*zprd_slab/nz_pppm;
+              wz = powsinxx(argz,twoorder);
+
+              dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
+              dot2 = qx*qx+qy*qy+qz*qz;
+              sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+            }
+          }
+        }
+        greensfn[n] = numerator*sum1/denominator;
+      } else greensfn[n] = 0.0;
+    }
+  } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+   compute optimized Green's function for energy calculation
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::compute_gf_ad()
+{
+
+  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  const int numk = nxhi_fft - nxlo_fft + 1;
+  const int numl = nyhi_fft - nylo_fft + 1;
+
+  const int twoorder = 2*order;
+  double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
+#endif
+  {
+    double snx,sny,snz,sqk;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double numerator,denominator;
+    int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
+
+    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+    for (n = nfrom; n < nto; ++n) {
+
+      m = n / (numl*numk);
+      l = (n - m*numl*numk) / numk;
+      k = n - m*numl*numk - l*numk;
+      m += nzlo_fft;
+      l += nylo_fft;
+      k += nxlo_fft;
+
+      mper = m - nz_pppm*(2*m/nz_pppm);
+      qz = unitkz*mper;
+      snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
+      sz = exp(-0.25*square(qz/g_ewald));
+      argz = 0.5*qz*zprd_slab/nz_pppm;
+      wz = powsinxx(argz,twoorder);
+
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      qy = unitky*lper;
+      sny = square(sin(0.5*qy*yprd/ny_pppm));
+      sy = exp(-0.25*square(qy/g_ewald));
+      argy = 0.5*qy*yprd/ny_pppm;
+      wy = powsinxx(argy,twoorder);
+
+      kper = k - nx_pppm*(2*k/nx_pppm);
+      qx = unitkx*kper;
+      snx = square(sin(0.5*qx*xprd/nx_pppm));
+      sx = exp(-0.25*square(qx/g_ewald));
+      argx = 0.5*qx*xprd/nx_pppm;
+      wx = powsinxx(argx,twoorder);
+
+      sqk = qx*qx + qy*qy + qz*qz;
+
+      if (sqk != 0.0) {
+        numerator = MY_4PI/sqk;
+        denominator = gf_denom(snx,sny,snz);
+        greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      } else {
+        greensfn[n] = 0.0;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      }
+    }
+  } // end of parallel region
+
+  // compute the coefficients for the self-force correction
+
+  double prex, prey, prez, tmp[6];
+  prex = prey = prez = MY_PI/volume;
+  prex *= nx_pppm/xprd;
+  prey *= ny_pppm/yprd;
+  prez *= nz_pppm/zprd_slab;
+  tmp[0] = sf0 * prex;
+  tmp[1] = sf1 * prex*2;
+  tmp[2] = sf2 * prey;
+  tmp[3] = sf3 * prey*2;
+  tmp[4] = sf4 * prez;
+  tmp[5] = sf5 * prez*2;
+
+  // communicate values with other procs
+
+  MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
+}
+
+/* ----------------------------------------------------------------------
+   run the regular toplevel compute method from plain PPPM
+   which will have individual methods replaced by our threaded
+   versions and then call the obligatory force reduction.
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::compute(int eflag, int vflag)
+{
+
+  PPPMTIP4P::compute(eflag,vflag);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
 }

 /* ----------------------------------------------------------------------
@@ -107,8 +330,12 @@ void PPPMTIP4POMP::init()

 void PPPMTIP4POMP::particle_map()
 {
-  const int * const type = atom->type;
-  const double * const * const x = atom->x;
+  const int * _noalias const type = atom->type;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];

   const int nlocal = atom->nlocal;
   int i, flag = 0;
@@ -116,34 +343,33 @@
 #pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static)
 #endif
   for (i = 0; i < nlocal; i++) {
-    int nx,ny,nz,iH1,iH2;
-    double xM[3];
+    dbl3_t xM;
+    int iH1,iH2;

     if (type[i] == typeO) {
-      find_M(i,iH1,iH2,xM);
+      find_M_thr(i,iH1,iH2,xM);
     } else {
-      xM[0] = x[i][0];
-      xM[1] = x[i][1];
-      xM[2] = x[i][2];
+      xM = x[i];
     }

     // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
     // current particle coord can be outside global and local box
     // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1

-    nx = static_cast<int> ((xM[0]-boxlo[0])*delxinv+shift) - OFFSET;
-    ny = static_cast<int> ((xM[1]-boxlo[1])*delyinv+shift) - OFFSET;
-    nz = static_cast<int> ((xM[2]-boxlo[2])*delzinv+shift) - OFFSET;
+    const int nx = static_cast<int> ((xM.x-boxlox)*delxinv+shift) - OFFSET;
+    const int ny = static_cast<int> ((xM.y-boxloy)*delyinv+shift) - OFFSET;
+    const int nz = static_cast<int> ((xM.z-boxloz)*delzinv+shift) - OFFSET;

-    part2grid[i][0] = nx;
-    part2grid[i][1] = ny;
-    part2grid[i][2] = nz;
+    p2g[i].a = nx;
+    p2g[i].b = ny;
+    p2g[i].t = nz;

     // check that entire stencil around nx,ny,nz will fit in my 3d brick

     if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
         ny+nlower < nylo_out || ny+nupper > nyhi_out ||
-        nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag++;
+        nz+nlower < nzlo_out || nz+nupper > nzhi_out)
+      flag++;
   }

   int flag_all;
@@ -160,96 +386,68 @@

 void PPPMTIP4POMP::make_rho()
 {
-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
-  const int * const type = atom->type;
-  const int nthreads = comm->nthreads;
+  const double * _noalias const q = atom->q;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const int * _noalias const type = atom->type;
+
+  dbl3_t xM;
+  FFT_SCALAR x0,y0,z0;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  int l,m,n,mx,my,mz,iH1,iH2;
+  const int nlocal = atom->nlocal;

-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int tid = 0;
-    const int ifrom = 0;
-    const int ito = nlocal;
-#endif
+  // clear 3d density array

-    int l,m,n,nx,ny,nz,mx,my,mz,iH1,iH2;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-    double xM[3];
+  FFT_SCALAR *vec = &density_brick[nzlo_out][nylo_out][nxlo_out];
+  memset(vec,0,ngrid*sizeof(FFT_SCALAR));

-    // set up clear 3d density array
-    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
-    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
-    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt

-    ThrData *thr = fix->get_thr(tid);
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+  for (int i = 0; i < nlocal; i++) {
+    if (type[i] == typeO) {
+      find_M_thr(i,iH1,iH2,xM);
+    } else {
+      xM = x[i];
+    }

-    // loop over my charges, add their contribution to nearby grid points
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // (dx,dy,dz) = distance to "lower left" grid pt
-    // (mx,my,mz) = global coords of moving stencil pt
+    const int nx = p2g[i].a;
+    const int ny = p2g[i].b;
+    const int nz = p2g[i].t;
+    const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
+    const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
+    const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;

-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
+    compute_rho1d(dx,dy,dz);

-        if (type[i] == typeO) {
-          find_M(i,iH1,iH2,xM);
-        } else {
-          xM[0] = x[i][0];
-          xM[1] = x[i][1];
-          xM[2] = x[i][2];
-        }
-
-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (xM[0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (xM[1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (xM[2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        z0 = delvolinv * q[i];
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          y0 = z0*r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            x0 = y0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              db[mz][my][mx] += x0*r1d[0][l];
-            }
-          }
+    z0 = delvolinv * q[i];
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      y0 = z0*rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        x0 = y0*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          density_brick[mz][my][mx] += x0*rho1d[0][l];
        }
      }
    }
-#if defined(_OPENMP)
-    // reduce 3d density array
-    if (nthreads > 1) {
-      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
-    }
-#endif
-  }
 }

 /* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
+   interpolate from grid to get electric field & force on my particles for ik
 ------------------------------------------------------------------------- */

-void PPPMTIP4POMP::fieldforce()
+void PPPMTIP4POMP::fieldforce_ik()
 {
   // loop over my charges, interpolate electric field from nearby grid points
@@ -257,106 +455,216 @@ void PPPMTIP4POMP::fieldforce()
   // (mx,my,mz) = global coords of moving stencil pt
   // ek = 3 components of E-field on particle

-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
-  const int * const type = atom->type;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const int * _noalias const type = atom->type;
+
+  const double qqrd2e = force->qqrd2e;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  const int nthreads = comm->nthreads;
   const int nlocal = atom->nlocal;
-  const double qqrd2e = force->qqrd2e;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int ifrom = 0;
-    const int ito = nlocal;
-    const int tid = 0;
-#endif
+    dbl3_t xM;
+    FFT_SCALAR x0,y0,z0,ekx,eky,ekz;
+    int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
+
+    // get per thread data
     ThrData *thr = fix->get_thr(tid);
-    double * const * const f = thr->get_f();
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

-    int l,m,n,nx,ny,nz,mx,my,mz;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-    FFT_SCALAR ekx,eky,ekz;
-    int iH1,iH2;
-    double xM[3], fx,fy,fz;
+    for (i = ifrom; i < ito; ++i) {
+      if (type[i] == typeO) {
+        find_M_thr(i,iH1,iH2,xM);
+      } else xM = x[i];

-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
+      const int nx = p2g[i].a;
+      const int ny = p2g[i].b;
+      const int nz = p2g[i].t;
+      const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
+      const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
+      const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;

-        if (type[i] == typeO) {
-          find_M(i,iH1,iH2,xM);
-        } else {
-          xM[0] = x[i][0];
-          xM[1] = x[i][1];
-          xM[2] = x[i][2];
-        }
+      compute_rho1d_thr(r1d,dx,dy,dz);

-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (xM[0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (xM[1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (xM[2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        ekx = eky = ekz = ZEROF;
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          z0 = r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            y0 = z0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              x0 = y0*r1d[0][l];
-              ekx -= x0*vdx_brick[mz][my][mx];
-              eky -= x0*vdy_brick[mz][my][mx];
-              ekz -= x0*vdz_brick[mz][my][mx];
-            }
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        z0 = r1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          y0 = z0*r1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            x0 = y0*r1d[0][l];
+            ekx -= x0*vdx_brick[mz][my][mx];
+            eky -= x0*vdy_brick[mz][my][mx];
+            ekz -= x0*vdz_brick[mz][my][mx];
          }
        }
+      }

-        // convert E-field to force
-        const double qfactor = qqrd2e*scale*q[i];
+      // convert E-field to force
+
+      const double qfactor = qqrd2e * scale * q[i];
+      if (type[i] != typeO) {
+        f[i].x += qfactor*ekx;
+        f[i].y += qfactor*eky;
+        if (slabflag != 2) f[i].z += qfactor*ekz;

-        if (type[i] != typeO) {
-          f[i][0] += qfactor*ekx;
-          f[i][1] += qfactor*eky;
-          f[i][2] += qfactor*ekz;
-
-        } else {
-          fx = qfactor * ekx;
-          fy = qfactor * eky;
-          fz = qfactor * ekz;
-          find_M(i,iH1,iH2,xM);
+      } else {
+        const double fx = qfactor * ekx;
+        const double fy = qfactor * eky;
+        const double fz = qfactor * ekz;

-          f[i][0] += fx*(1 - alpha);
-          f[i][1] += fy*(1 - alpha);
-          f[i][2] += fz*(1 - alpha);
+        f[i].x += fx*(1 - alpha);
+        f[i].y += fy*(1 - alpha);
+        if (slabflag != 2) f[i].z += fz*(1 - alpha);

-          f[iH1][0] += 0.5*alpha*fx;
-          f[iH1][1] += 0.5*alpha*fy;
-          f[iH1][2] += 0.5*alpha*fz;
+        f[iH1].x += 0.5*alpha*fx;
+        f[iH1].y += 0.5*alpha*fy;
+        if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;

-          f[iH2][0] += 0.5*alpha*fx;
-          f[iH2][1] += 0.5*alpha*fy;
-          f[iH2][2] += 0.5*alpha*fz;
-        }
+        f[iH2].x += 0.5*alpha*fx;
+        f[iH2].y += 0.5*alpha*fy;
+        if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
      }
    }
-  }
+  } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles for ad
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::fieldforce_ad()
+{
+  const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
+  const double hx_inv = nx_pppm/prd[0];
+  const double hy_inv = ny_pppm/prd[1];
+  const double hz_inv = nz_pppm/prd[2];
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const int * _noalias const type = atom->type;
+
+  const double qqrd2e = force->qqrd2e;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    double s1,s2,s3,sf;
+    dbl3_t xM;
+    FFT_SCALAR ekx,eky,ekz;
+    int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
+
+    // get per thread data
+    ThrData *thr = fix->get_thr(tid);
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
+
+    for (i = ifrom; i < ito; ++i) {
+      if (type[i] == typeO) {
+        find_M_thr(i,iH1,iH2,xM);
+      } else xM = x[i];
+
+      const int nx = p2g[i].a;
+      const int ny = p2g[i].b;
+      const int nz = p2g[i].t;
+      const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
+      const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
+      const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;
+
+      compute_rho1d_thr(r1d,dx,dy,dz);
+      compute_drho1d_thr(d1d,dx,dy,dz);
+
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
+          }
+        }
+      }
+      ekx *= hx_inv;
+      eky *= hy_inv;
+      ekz *= hz_inv;
+
+      // convert E-field to force and subtract self forces
+
+      const double qi = q[i];
+      const double qfactor = qqrd2e * scale * qi;
+
+      s1 = x[i].x*hx_inv;
+      sf = sf_coeff[0]*sin(MY_2PI*s1);
+      sf += sf_coeff[1]*sin(MY_4PI*s1);
+      sf *= 2.0*qi;
+      const double fx = qfactor*(ekx - sf);
+
+      s2 = x[i].y*hy_inv;
+      sf = sf_coeff[2]*sin(MY_2PI*s2);
+      sf += sf_coeff[3]*sin(MY_4PI*s2);
+      sf *= 2.0*qi;
+      const double fy = qfactor*(eky - sf);
+
+      s3 = x[i].z*hz_inv;
+      sf = sf_coeff[4]*sin(MY_2PI*s3);
+      sf += sf_coeff[5]*sin(MY_4PI*s3);
+      sf *= 2.0*qi;
+      const double fz = qfactor*(ekz - sf);
+
+      if (type[i] != typeO) {
+        f[i].x += fx;
+        f[i].y += fy;
+        if (slabflag != 2) f[i].z += fz;
+
+      } else {
+        f[i].x += fx*(1 - alpha);
+        f[i].y += fy*(1 - alpha);
+        if (slabflag != 2) f[i].z += fz*(1 - alpha);
+
+        f[iH1].x += 0.5*alpha*fx;
+        f[iH1].y += 0.5*alpha*fy;
+        if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;
+
+        f[iH2].x += 0.5*alpha*fx;
+        f[iH2].y += 0.5*alpha*fy;
+        if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
+      }
+    }
+  } // end of parallel region
 }

 /* ----------------------------------------------------------------------
@@ -365,7 +673,7 @@ void PPPMTIP4POMP::fieldforce()
    also return local indices iH1,iH2 of H atoms
 ------------------------------------------------------------------------- */

-void PPPMTIP4POMP::find_M(int i, int &iH1, int &iH2, double *xM)
+void PPPMTIP4POMP::find_M_thr(int i, int &iH1, int &iH2, dbl3_t &xM)
 {
   iH1 = atom->map(atom->tag[i] + 1);
   iH2 = atom->map(atom->tag[i] + 2);
@@ -374,19 +682,69 @@ void PPPMTIP4POMP::find_M(int i, int &iH1, int &iH2, double *xM)

   if (atom->type[iH1] != typeH || atom->type[iH2] != typeH)
     error->one(FLERR,"TIP4P hydrogen has incorrect atom type");

-  const double * const * const x = atom->x;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];

-  double delx1 = x[iH1][0] - x[i][0];
-  double dely1 = x[iH1][1] - x[i][1];
-  double delz1 = x[iH1][2] - x[i][2];
+  double delx1 = x[iH1].x - x[i].x;
+  double dely1 = x[iH1].y - x[i].y;
+  double delz1 = x[iH1].z - x[i].z;
   domain->minimum_image(delx1,dely1,delz1);

-  double delx2 = x[iH2][0] - x[i][0];
-  double dely2 = x[iH2][1] - x[i][1];
-  double delz2 = x[iH2][2] - x[i][2];
+  double delx2 = x[iH2].x - x[i].x;
+  double dely2 = x[iH2].y - x[i].y;
+  double delz2 = x[iH2].z - x[i].z;
   domain->minimum_image(delx2,dely2,delz2);

-  xM[0] = x[i][0] + alpha * 0.5 * (delx1 + delx2);
-  xM[1] = x[i][1] + alpha * 0.5 * (dely1 + dely2);
-  xM[2] = x[i][2] + alpha * 0.5 * (delz1 + delz2);
+  xM.x = x[i].x + alpha * 0.5 * (delx1 + delx2);
+  xM.y = x[i].y + alpha * 0.5 * (dely1 + dely2);
+  xM.z = x[i].z + alpha * 0.5 * (delz1 + delz2);
+}
+
+
+/* ----------------------------------------------------------------------
+   charge assignment into rho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+void PPPMTIP4POMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
+                                     const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-1; l >= 0; l--) {
+      r1 = rho_coeff[l][k] + r1*dx;
+      r2 = rho_coeff[l][k] + r2*dy;
+      r3 = rho_coeff[l][k] + r3*dz;
+    }
+    r1d[0][k] = r1;
+    r1d[1][k] = r2;
+    r1d[2][k] = r3;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   charge assignment into drho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
+                                      const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-2; l >= 0; l--) {
+      r1 = drho_coeff[l][k] + r1*dx;
+      r2 = drho_coeff[l][k] + r2*dy;
+      r3 = drho_coeff[l][k] + r3*dz;
+    }
+    d1d[0][k] = r1;
+    d1d[1][k] = r2;
+    d1d[2][k] = r3;
+  }
+}
diff --git a/src/USER-OMP/pppm_tip4p_omp.h b/src/USER-OMP/pppm_tip4p_omp.h
index 11801ccb35..0ee807651c 100644
--- a/src/USER-OMP/pppm_tip4p_omp.h
+++ b/src/USER-OMP/pppm_tip4p_omp.h
@@ -20,25 +20,40 @@ KSpaceStyle(pppm/tip4p/omp,PPPMTIP4POMP)
 #ifndef LMP_PPPM_TIP4P_OMP_H
 #define LMP_PPPM_TIP4P_OMP_H

-#include "pppm_omp.h"
+#include "pppm_tip4p.h"
+#include "thr_omp.h"

 namespace LAMMPS_NS {

-class PPPMTIP4POMP : public PPPMOMP {
+class PPPMTIP4POMP : public PPPMTIP4P, public ThrOMP {

  public:
   PPPMTIP4POMP(class LAMMPS *, int, char **);
   virtual ~PPPMTIP4POMP () {};
-  virtual void init();
+  virtual void compute(int, int);

  protected:
+  virtual void allocate();
+  virtual void deallocate();
+
+  virtual void compute_gf_ik();
+  virtual void compute_gf_ad();
+
   virtual void particle_map();
-  virtual void fieldforce();
-  virtual void make_rho();
+  virtual void make_rho(); // XXX: not (yet) multi-threaded
+
+  virtual void fieldforce_ik();
+  virtual void fieldforce_ad();
+  // virtual void fieldforce_peratom(); XXX: need to benchmark first.

  private:
-  void find_M(int, int &, int &, double *);
+  void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                         const FFT_SCALAR &, const FFT_SCALAR &);
+  void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                          const FFT_SCALAR &, const FFT_SCALAR &);

-// void slabcorr(int);
+  void find_M_thr(const int, int &, int &, dbl3_t &);
+
+// void slabcorr(int); // XXX: not (yet) multi-threaded
 };
diff --git a/src/USER-OMP/thr_data.cpp b/src/USER-OMP/thr_data.cpp
index cebed27721..c0217ed01f 100644
--- a/src/USER-OMP/thr_data.cpp
+++ b/src/USER-OMP/thr_data.cpp
@@ -21,13 +21,14 @@

 #include <string.h>
 #include <stdio.h>

+#include "memory.h"
+
 using namespace LAMMPS_NS;

 ThrData::ThrData(int tid)
-  : _f(NULL), _torque(NULL), _erforce(NULL), _de(NULL), _drho(NULL), _mu(NULL),
-    _lambda(NULL), _rhoB(NULL), _D_values(NULL), _rho(NULL), _fp(NULL),
-    _rho1d(NULL), _tid(tid)
+  : _f(0),_torque(0),_erforce(0),_de(0),_drho(0),_mu(0),_lambda(0),_rhoB(0),
+    _D_values(0),_rho(0),_fp(0),_rho1d(0),_drho1d(0),_tid(tid)
 {
   // nothing else to do here.
 }
@@ -136,6 +137,33 @@ void ThrData::init_eim(int nall, double *rho, double *fp)
   memset(_fp,0,nall*sizeof(double));
 }

+/* ----------------------------------------------------------------------
+   if order > 0 : set up per thread storage for PPPM
+   if order < 0 : free per thread storage for PPPM
+------------------------------------------------------------------------- */
+#if defined(FFT_SINGLE)
+typedef float FFT_SCALAR;
+#else
+typedef double FFT_SCALAR;
+#endif
+
+void ThrData::init_pppm(int order, Memory *memory)
+{
+  FFT_SCALAR **rho1d, **drho1d;
+  if (order > 0) {
+    memory->create2d_offset(rho1d,3,-order/2,order/2,"thr_data:rho1d");
+    memory->create2d_offset(drho1d,3,-order/2,order/2,"thr_data:drho1d");
+    _rho1d = static_cast<void *>(rho1d);
+    _drho1d = static_cast<void *>(drho1d);
+  } else {
+    order = -order;
+    rho1d = static_cast<FFT_SCALAR **>(_rho1d);
+    drho1d = static_cast<FFT_SCALAR **>(_drho1d);
+    memory->destroy2d_offset(rho1d,-order/2);
+    memory->destroy2d_offset(drho1d,-order/2);
+  }
+}
+
 /* ----------------------------------------------------------------------
    compute global pair virial via summing F dot r over own & ghost atoms
    at this point, only pairwise forces have been accumulated in atom->f
diff --git a/src/USER-OMP/thr_data.h b/src/USER-OMP/thr_data.h
index 6345a203c7..7fc80a4929 100644
--- a/src/USER-OMP/thr_data.h
+++ b/src/USER-OMP/thr_data.h
@@ -53,7 +53,7 @@ class ThrData {
   void init_eam(int, double *);            // EAM
   void init_eim(int, double *, double *);  // EIM (+ EAM)

-  void init_pppm(void *r1d) { _rho1d = r1d; };
+  void init_pppm(int, class Memory *);

   // access methods for arrays that we handle in this class
   double **get_lambda() const { return _lambda; };
@@ -63,6 +63,7 @@ class ThrData {
   double *get_rho() const { return _rho; };
   double *get_rhoB() const { return _rhoB; };
   void *get_rho1d() const { return _rho1d; };
+  void *get_drho1d() const { return _drho1d; };

  private:
   double eng_vdwl; // non-bonded non-coulomb energy
@@ -108,6 +109,7 @@ class ThrData {

   // this is for pppm/omp
   void *_rho1d;
+  void *_drho1d;

   // my thread id
   const int _tid;
diff --git a/src/USER-OMP/thr_omp.h b/src/USER-OMP/thr_omp.h
index 3c5bbdec3e..79d5c4548c 100644
--- a/src/USER-OMP/thr_omp.h
+++ b/src/USER-OMP/thr_omp.h
@@ -186,4 +186,19 @@ static inline void loop_setup_thr(int &ifrom, int &ito, int &tid,
   }
 }

+
+// helpful definitions to help compilers optimize code better
+
+typedef struct { double x,y,z; } dbl3_t;
+typedef struct { double x,y,z,w; } dbl4_t;
+typedef struct { int a,b,t; } int3_t;
+typedef struct { int a,b,c,t; } int4_t;
+typedef struct { int a,b,c,d,t; } int5_t;
+
+#if defined(__GNUC__)
+#define _noalias __restrict
+#else
+#define _noalias
+#endif
+
 #endif
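
A note on the flat-index decomposition used in the new compute_gf_ik()/compute_gf_ad() loops: instead of three nested loops over the FFT sub-grid, the patch runs one loop over n in [nfrom,nto) and recovers (k,l,m) arithmetically, so loop_setup_thr() can hand each thread a contiguous chunk. The following standalone sketch (bounds and variable names are made up for illustration, not taken from LAMMPS) verifies that the decomposition is exactly invertible:

#include <cassert>

int main()
{
  // hypothetical example bounds of one processor's FFT-space sub-grid
  const int nxlo = 0, nxhi = 4, nylo = 0, nyhi = 3, nzlo = 0, nzhi = 2;
  const int numk = nxhi - nxlo + 1;    // x is the fastest-varying dimension
  const int numl = nyhi - nylo + 1;
  const int nfft = numk*numl*(nzhi - nzlo + 1);

  for (int n = 0; n < nfft; ++n) {
    // same arithmetic as in the patch above
    int m = n / (numl*numk);
    int l = (n - m*numl*numk) / numk;
    int k = n - m*numl*numk - l*numk;
    m += nzlo; l += nylo; k += nxlo;
    // invert the mapping to confirm nothing is lost or duplicated
    assert(n == (k-nxlo) + numk*((l-nylo) + numl*(m-nzlo)));
  }
  return 0;
}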
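The patch also replaces the removed idiom "wx = 1.0; if (argx != 0.0) wx = pow(sin(argx)/argx,dorder); wx *= wx;" with a single call powsinxx(argx,twoorder) from math_special.h. Judging purely from the code it replaces, the helper evaluates (sin(x)/x)^n with the x == 0 limit handled explicitly; a minimal stand-in with that assumed behavior (the real powsinxx() may be implemented differently, e.g. without calling pow()) looks like:

#include <math.h>

// sketch only: assumed semantics of MathSpecial::powsinxx(x,n)
static double powsinxx_sketch(const double x, int n)
{
  if (x == 0.0) return 1.0;          // lim x->0 of sin(x)/x is 1
  const double f = sin(x)/x;
  return pow(f,(double) n);          // n = 2*order in the loops above
}

Folding the squaring into the exponent (twoorder = 2*order) is what lets the two-step "pow then square" of the old code collapse into one call.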
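loop_setup_thr() (declared in thr_omp.h above) replaces the per-function chunking boilerplate that this patch deletes everywhere. Reconstructed from that removed code, the partitioning is a simple static split into equal chunks; a sketch under that assumption, with the thread id passed in explicitly so it runs without OpenMP:

// sketch of the static work partitioning, not the original implementation
static void chunk_bounds(int &ifrom, int &ito, int tid,
                         int inum, int nthreads)
{
  const int idelta = 1 + inum/nthreads;           // chunk size per thread
  ifrom = tid*idelta;
  ito = (ifrom + idelta > inum) ? inum : ifrom + idelta;
  // if there are more threads than work items, ifrom >= ito and the
  // caller's for (i = ifrom; i < ito; ++i) loop simply does nothing
}

Centralizing this removes the repeated "this if protects against having more threads than local atoms" guard, since an empty [ifrom,ito) range is harmless.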
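Why the dbl3_t/int3_t typedefs and the _noalias macro at the end of thr_omp.h matter: atom->x is a double** whose rows live in one contiguous 3N block, so casting x[0] to a struct-of-three pointer with __restrict tells the compiler that distinct array elements do not alias, which helps vectorization of the interpolation loops. A minimal usage sketch (function name is hypothetical; the contiguous-storage assumption mirrors what the patch relies on):

typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif

// scale all positions; xraw[0] must point at contiguous x,y,z triples
void scale_positions(double **xraw, int n, double s)
{
  dbl3_t * _noalias const x = (dbl3_t *) xraw[0];  // same cast as the patch
  for (int i = 0; i < n; ++i) {
    x[i].x *= s;
    x[i].y *= s;
    x[i].z *= s;
  }
}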
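Finally, on compute_rho1d_thr() versus the new compute_drho1d_thr(): both evaluate the per-dimension stencil weights by Horner's rule, and the derivative loop starts at l = order-2 because differentiating the degree order-1 weight polynomial drops the degree by one. A compact sketch for a single grid offset k (coeff/dcoeff are hypothetical 1-D stand-ins for rho_coeff[l][k] and drho_coeff[l][k]):

// Horner evaluation of a weight polynomial and its derivative polynomial
static void horner_pair(const double *coeff, const double *dcoeff,
                        int order, double dx, double *w, double *dw)
{
  double r = 0.0, d = 0.0;
  for (int l = order-1; l >= 0; --l) r = coeff[l] + r*dx;   // degree order-1
  for (int l = order-2; l >= 0; --l) d = dcoeff[l] + d*dx;  // one degree less
  *w = r;
  *dw = d;
}

This is why the ad ("analytic differentiation") force path needs only u_brick plus drho1d, instead of the three vdx/vdy/vdz bricks used by the ik path.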