From 8b76e47d6bb084cbef50332ef1af0caad5d75854 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Wed, 5 Aug 2020 16:44:56 -0600 Subject: [PATCH] support for tiled decompositions in PPPM --- src/KSPACE/gridcomm2.cpp | 1093 ++++++++++++ src/KSPACE/gridcomm2.h | 201 +++ src/KSPACE/pppm2.cpp | 3524 ++++++++++++++++++++++++++++++++++++++ src/KSPACE/pppm2.h | 360 ++++ src/force.cpp | 6 +- src/kspace.h | 5 + 6 files changed, 5186 insertions(+), 3 deletions(-) create mode 100644 src/KSPACE/gridcomm2.cpp create mode 100644 src/KSPACE/gridcomm2.h create mode 100644 src/KSPACE/pppm2.cpp create mode 100644 src/KSPACE/pppm2.h diff --git a/src/KSPACE/gridcomm2.cpp b/src/KSPACE/gridcomm2.cpp new file mode 100644 index 0000000000..ce9a1e7568 --- /dev/null +++ b/src/KSPACE/gridcomm2.cpp @@ -0,0 +1,1093 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "gridcomm2.h" +#include +#include "comm.h" +#include "kspace.h" +#include "irregular.h" +#include "memory.h" + +using namespace LAMMPS_NS; + +enum{REGULAR,TILED}; + +#define SWAPDELTA 8 + +// NOTE: gridcomm needs to be world for TILED, will it work with MSM? +// NOTE: Tiled implementation here only works for RCB, not general tiled + +/* ---------------------------------------------------------------------- + gcomm = MPI communicator that shares this grid + does not have to be world, see MSM + gn xyz = size of global grid + i xyz lohi = portion of global grid this proc owns, 0 <= index < N + o xyz lohi = owned grid portion + ghost grid cells needed in all directions + if o indices are < 0 or hi indices are >= N, + then grid is treated as periodic in that dimension, + communication is done across the periodic boundaries +------------------------------------------------------------------------- */ + +GridComm2::GridComm2(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) + : Pointers(lmp) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); + + nx = gnx; + ny = gny; + nz = gnz; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + // layout == REGULAR or TILED + // for REGULAR, proc xyz lohi = my 6 neighbor procs + + layout = REGULAR; + if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + + outxlo_max = oxlo; + outxhi_max = oxhi; + outylo_max = oylo; + outyhi_max = oyhi; + outzlo_max = ozlo; + outzhi_max = ozhi; + + if (layout == REGULAR) { + int (*procneigh)[2] = comm->procneigh; + + procxlo = procneigh[0][0]; + procxhi = procneigh[0][1]; + procylo = procneigh[1][0]; + procyhi = procneigh[1][1]; + proczlo = procneigh[2][0]; + proczhi = procneigh[2][1]; + } + + nswap = maxswap = 0; + swap = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy = 
NULL; + requests = NULL; +} + +/* ---------------------------------------------------------------------- + same as first constructor except o xyz lohi max are added arguments + this is for case when caller stores grid in a larger array than o xyz lohi + only affects indices() method which generates indices into the caller's array +------------------------------------------------------------------------- */ + +GridComm2::GridComm2(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, + int ozlo_max, int ozhi_max) + : Pointers(lmp) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); + + nx = gnx; + ny = gny; + nz = gnz; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + outxlo_max = oxlo_max; + outxhi_max = oxhi_max; + outylo_max = oylo_max; + outyhi_max = oyhi_max; + outzlo_max = ozlo_max; + outzhi_max = ozhi_max; + + // layout == REGULAR or TILED + // for REGULAR, proc xyz lohi = my 6 neighbor procs + + layout = REGULAR; + if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + + if (layout == REGULAR) { + int (*procneigh)[2] = comm->procneigh; + + procxlo = procneigh[0][0]; + procxhi = procneigh[0][1]; + procylo = procneigh[1][0]; + procyhi = procneigh[1][1]; + proczlo = procneigh[2][0]; + proczhi = procneigh[2][1]; + } + + nswap = maxswap = 0; + swap = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy = NULL; + requests = NULL; +} + +/* ---------------------------------------------------------------------- */ + +GridComm2::~GridComm2() +{ + // regular comm data struct + + for (int i = 0; i < nswap; i++) { + memory->destroy(swap[i].packlist); + memory->destroy(swap[i].unpacklist); + } + memory->sfree(swap); + + // tiled comm data structs + + for (int i = 0; i < nsend; i++) + memory->destroy(send[i].packlist); + memory->sfree(send); + + for (int i = 0; i < nrecv; i++) + memory->destroy(recv[i].unpacklist); + memory->sfree(recv); + + for (int i = 0; i < ncopy; i++) { + memory->destroy(copy[i].packlist); + memory->destroy(copy[i].unpacklist); + } + memory->sfree(copy); + + delete [] requests; +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::setup(int &nbuf1, int &nbuf2) +{ + if (layout == REGULAR) setup_regular(nbuf1,nbuf2); + else setup_tiled(nbuf1,nbuf2); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::setup_regular(int &nbuf1, int &nbuf2) +{ + int nsent,sendfirst,sendlast,recvfirst,recvlast; + int sendplanes,recvplanes; + int notdoneme,notdone; + + // notify 6 neighbor procs how many ghost grid planes I need from them + // ghost xyz lo = # of my lower grid planes that proc xyz lo needs as its ghosts + // ghost xyz hi = # of my upper grid planes that proc xyz hi needs as its ghosts + // if this proc is its own neighbor across periodic bounary, value is from self + + int nplanes = inxlo - outxlo; + if (procxlo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0, + &ghostxhi,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostxhi = nplanes; + + nplanes = outxhi - inxhi; + if (procxhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procxhi,0, + 
&ghostxlo,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostxlo = nplanes; + + nplanes = inylo - outylo; + if (procylo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procylo,0, + &ghostyhi,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostyhi = nplanes; + + nplanes = outyhi - inyhi; + if (procyhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procyhi,0, + &ghostylo,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostylo = nplanes; + + nplanes = inzlo - outzlo; + if (proczlo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,proczlo,0, + &ghostzhi,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostzhi = nplanes; + + nplanes = outzhi - inzhi; + if (proczhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,proczhi,0, + &ghostzlo,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostzlo = nplanes; + + // setup swaps = exchange of grid data with one of 6 neighobr procs + // can be more than one in a direction if ghost region extends beyond neigh proc + // all procs have same swap count, but swapsize npack/nunpack can be empty + + nswap = 0; + + // send own grid pts to -x processor, recv ghost grid pts from +x processor + + nsent = 0; + sendfirst = inxlo; + sendlast = inxhi; + recvfirst = inxhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procxlo; + swap[nswap].recvproc = procxhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + sendfirst,sendfirst+sendplanes-1,inylo,inyhi,inzlo,inzhi); + + if (procxlo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procxlo,0, + &recvplanes,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + recvfirst,recvfirst+recvplanes-1,inylo,inyhi,inzlo,inzhi); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostxlo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +x processor, recv ghost grid pts from -x processor + + nsent = 0; + sendfirst = inxlo; + sendlast = inxhi; + recvlast = inxlo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procxhi; + swap[nswap].recvproc = procxlo; + sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + sendlast-sendplanes+1,sendlast,inylo,inyhi,inzlo,inzhi); + + if (procxhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procxhi,0, + &recvplanes,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + recvlast-recvplanes+1,recvlast,inylo,inyhi,inzlo,inzhi); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostxhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to -y processor, recv ghost grid pts from +y processor + + nsent = 0; + sendfirst = inylo; + sendlast = inyhi; + recvfirst = inyhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procylo; + swap[nswap].recvproc = procyhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,sendfirst,sendfirst+sendplanes-1,inzlo,inzhi); + + if 
(procylo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procylo,0, + &recvplanes,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,recvfirst,recvfirst+recvplanes-1,inzlo,inzhi); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostylo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +y processor, recv ghost grid pts from -y processor + + nsent = 0; + sendfirst = inylo; + sendlast = inyhi; + recvlast = inylo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procyhi; + swap[nswap].recvproc = procylo; + sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,sendlast-sendplanes+1,sendlast,inzlo,inzhi); + + if (procyhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procyhi,0, + &recvplanes,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,recvlast-recvplanes+1,recvlast,inzlo,inzhi); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostyhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to -z processor, recv ghost grid pts from +z processor + + nsent = 0; + sendfirst = inzlo; + sendlast = inzhi; + recvfirst = inzhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = proczlo; + swap[nswap].recvproc = proczhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostzlo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,outylo,outyhi,sendfirst,sendfirst+sendplanes-1); + + if (proczlo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,proczlo,0, + &recvplanes,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,outylo,outyhi,recvfirst,recvfirst+recvplanes-1); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostzlo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +z processor, recv ghost grid pts from -z processor + + nsent = 0; + sendfirst = inzlo; + sendlast = inzhi; + recvlast = inzlo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = proczhi; + swap[nswap].recvproc = proczlo; + sendplanes = MIN(sendlast-sendfirst+1,ghostzhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,outylo,outyhi,sendlast-sendplanes+1,sendlast); + + if (proczhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,proczhi,0, + &recvplanes,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,outylo,outyhi,recvlast-recvplanes+1,recvlast); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostzhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // 
ngrid = max of any forward/reverse pack/unpack grid points + + int ngrid = 0; + for (int i = 0; i < nswap; i++) { + ngrid = MAX(ngrid,swap[i].npack); + ngrid = MAX(ngrid,swap[i].nunpack); + } + + nbuf1 = nbuf2 = ngrid; +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::setup_tiled(int &nbuf1, int &nbuf2) +{ + int i,m; + double xlo,xhi,ylo,yhi,zlo,zhi; + int ghostbox[6],pbc[3]; + + // setup RCB tree of cut info for grid + // access CommTiled to get cut dimension + // cut = this proc's inlo in that dim + // dim is -1 for proc 0, but never accessed + + rcbinfo = (RCBinfo *) + memory->smalloc(nprocs*sizeof(RCBinfo),"GridComm:rcbinfo"); + RCBinfo rcbone; + rcbone.dim = comm->rcbcutdim; + if (rcbone.dim <= 0) rcbone.cut = inxlo; + else if (rcbone.dim == 1) rcbone.cut = inylo; + else if (rcbone.dim == 2) rcbone.cut = inzlo; + MPI_Allgather(&rcbone,sizeof(RCBinfo),MPI_CHAR, + rcbinfo,sizeof(RCBinfo),MPI_CHAR,gridcomm); + + // find overlaps of my extended ghost box with all other procs + // accounts for crossings of periodic boundaries + // noverlap = # of overlaps, including self + // overlap = vector of overlap info using Overlap data struct + + ghostbox[0] = outxlo; + ghostbox[1] = outxhi; + ghostbox[2] = outylo; + ghostbox[3] = outyhi; + ghostbox[4] = outzlo; + ghostbox[5] = outzhi; + + pbc[0] = pbc[1] = pbc[2] = 0; + + memory->create(overlap_procs,nprocs,"GridComm:overlap_procs"); + noverlap = maxoverlap = 0; + overlap = NULL; + + ghost_box_drop(ghostbox,pbc); + + // send each proc an overlap message + // content: me, index of my overlap, box that overlaps with its owned cells + // ncopy = # of overlaps with myself, across a periodic boundary + + int *proclist; + memory->create(proclist,noverlap,"GridComm:proclist"); + srequest = (Request *) + memory->smalloc(noverlap*sizeof(Request),"GridComm:srequest"); + + int nsend_request = 0; + ncopy = 0; + + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc == me) ncopy++; + else { + proclist[nsend_request] = overlap[m].proc; + srequest[nsend_request].sender = me; + srequest[nsend_request].index = m; + for (i = 0; i < 6; i++) + srequest[nsend_request].box[i] = overlap[m].box[i]; + nsend_request++; + } + } + + Irregular *irregular = new Irregular(lmp); + int nrecv_request = irregular->create_data(nsend_request,proclist,1); + Request *rrequest = + (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridComm:rrequest"); + irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); + irregular->destroy_data(); + + // compute overlaps between received ghost boxes and my owned box + // overlap box used to setup my Send data struct and respond to requests + + send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridComm:send"); + sresponse = (Response *) + memory->smalloc(nrecv_request*sizeof(Response),"GridComm:sresponse"); + memory->destroy(proclist); + memory->create(proclist,nrecv_request,"GridComm:proclist"); + + for (m = 0; m < nrecv_request; m++) { + send[m].proc = rrequest[m].sender; + xlo = MAX(rrequest[m].box[0],inxlo); + xhi = MIN(rrequest[m].box[1],inxhi); + ylo = MAX(rrequest[m].box[2],inylo); + yhi = MIN(rrequest[m].box[3],inyhi); + zlo = MAX(rrequest[m].box[4],inzlo); + zhi = MIN(rrequest[m].box[5],inzhi); + send[m].npack = indices(send[m].packlist,xlo,xhi,ylo,yhi,zlo,zhi); + + proclist[m] = rrequest[m].sender; + sresponse[m].index = rrequest[m].index; + sresponse[m].box[0] = xlo; + 
sresponse[m].box[1] = xhi; + sresponse[m].box[2] = ylo; + sresponse[m].box[3] = yhi; + sresponse[m].box[4] = zlo; + sresponse[m].box[5] = zhi; + } + + nsend = nrecv_request; + + // reply to each Request message with a Response message + // content: index for the overlap on requestor, overlap box on my owned grid + + int nsend_response = nrecv_request; + int nrecv_response = irregular->create_data(nsend_response,proclist,1); + Response *rresponse = + (Response *) memory->smalloc(nrecv_response*sizeof(Response),"GridComm:rresponse"); + irregular->exchange_data((char *) sresponse,sizeof(Response),(char *) rresponse); + irregular->destroy_data(); + delete irregular; + + // process received responses + // box used to setup my Recv data struct after unwrapping via PBC + // adjacent = 0 if any box of ghost cells does not adjoin my owned cells + + recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"CommGrid:recv"); + adjacent = 1; + + for (i = 0; i < nrecv_response; i++) { + m = rresponse[i].index; + recv[i].proc = overlap[m].proc; + xlo = rresponse[i].box[0] + overlap[m].pbc[0] * nx; + xhi = rresponse[i].box[1] + overlap[m].pbc[0] * nx; + ylo = rresponse[i].box[2] + overlap[m].pbc[1] * ny; + yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny; + zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; + zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; + recv[i].nunpack = indices(recv[i].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); + + if (xlo != inxhi+1 && xhi != inxlo-1 && + ylo != inyhi+1 && yhi != inylo-1 && + zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; + } + + nrecv = nrecv_response; + + // create Copy data struct from overlaps with self + + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"CommGrid:copy"); + + ncopy = 0; + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc != me) continue; + xlo = overlap[m].box[0]; + xhi = overlap[m].box[1]; + ylo = overlap[m].box[2]; + yhi = overlap[m].box[3]; + zlo = overlap[m].box[4]; + zhi = overlap[m].box[5]; + copy[ncopy].npack = indices(copy[ncopy].packlist,xlo,xhi,ylo,yhi,zlo,zhi); + xlo = overlap[m].box[0] + overlap[m].pbc[0] * nx; + xhi = overlap[m].box[1] + overlap[m].pbc[0] * nx; + ylo = overlap[m].box[2] + overlap[m].pbc[1] * ny; + yhi = overlap[m].box[3] + overlap[m].pbc[1] * ny; + zlo = overlap[m].box[4] + overlap[m].pbc[2] * nz; + zhi = overlap[m].box[5] + overlap[m].pbc[2] * nz; + copy[ncopy].nunpack = indices(copy[ncopy].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); + ncopy++; + } + + // set offsets for received data + + int offset = 0; + for (m = 0; m < nsend; m++) { + send[m].offset = offset; + offset += send[m].npack; + } + + offset = 0; + for (m = 0; m < nrecv; m++) { + recv[m].offset = offset; + offset += recv[m].nunpack; + } + + // length of MPI requests vector is max of nsend, nrecv + + int nrequest = MAX(nsend,nrecv); + requests = new MPI_Request[nrequest]; + + // clean-up + + memory->sfree(rcbinfo); + memory->destroy(proclist); + memory->destroy(overlap_procs); + memory->sfree(overlap); + memory->sfree(srequest); + memory->sfree(rrequest); + memory->sfree(sresponse); + memory->sfree(rresponse); + + // nbuf1 = largest pack or unpack in any Send or Recv or Copy + // nbuf2 = larget of sum of all packs or unpacks in Send or Recv + + nbuf1 = 0; + + for (m = 0; m < ncopy; m++) { + nbuf1 = MAX(nbuf1,copy[m].npack); + nbuf1 = MAX(nbuf1,copy[m].nunpack); + } + + int nbufs = 0; + for (m = 0; m < nsend; m++) { + nbuf1 = MAX(nbuf1,send[m].npack); + nbufs += send[m].npack; + } + + int nbufr = 0; + for (m = 0; m < nrecv; m++) { + nbuf1 = 
MAX(nbuf1,recv[m].nunpack); + nbufr += recv[m].nunpack; + } + + nbuf2 = MAX(nbufs,nbufr); +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::ghost_box_drop(int *box, int *pbc) +{ + int i,m; + + // newbox12 and newpbc are initially copies of caller box and pbc + + int newbox1[6],newbox2[6],newpbc[3]; + + for (i = 0; i < 6; i++) newbox1[i] = newbox2[i] = box[i]; + for (i = 0; i < 3; i++) newpbc[i] = pbc[i]; + + // 6 if tests to see if box needs to be split across a periodic boundary + // final else is no split + + int splitflag = 1; + + if (box[0] < 0) { + newbox1[0] = 0; + newbox2[0] = box[0] + nx; + newbox2[1] = nx - 1; + newpbc[0]--; + } else if (box[1] >= nx) { + newbox1[1] = nx - 1; + newbox2[0] = 0; + newbox2[1] = box[1] - nx; + newpbc[0]++; + } else if (box[2] < 0) { + newbox1[2] = 0; + newbox2[2] = box[2] + ny; + newbox2[3] = ny - 1; + newpbc[1]--; + } else if (box[3] >= ny) { + newbox1[3] = ny - 1; + newbox2[2] = 0; + newbox2[3] = box[3] - ny; + newpbc[1]++; + } else if (box[4] < 0) { + newbox1[4] = 0; + newbox2[4] = box[4] + nz; + newbox2[5] = nz - 1; + newpbc[2]--; + } else if (box[5] >= nz) { + newbox1[5] = nz - 1; + newbox2[4] = 0; + newbox2[5] = box[5] - nz; + newpbc[2]++; + + // box is not split, drop on RCB tree + // returns nprocs = # of procs it overlaps, including self + // returns proc_overlap = list of proc IDs it overlaps + // skip self overlap if no crossing of periodic boundaries + + } else { + splitflag = 0; + int np = 0; + box_drop_grid(box,0,nprocs-1,np,overlap_procs); + for (m = 0; m < np; m++) { + if (noverlap == maxoverlap) grow_overlap(); + if (overlap_procs[m] == me && + pbc[0] == 0 && pbc[1] == 0 && pbc[2] == 0) continue; + overlap[noverlap].proc = overlap_procs[m]; + for (i = 0; i < 6; i++) overlap[noverlap].box[i] = box[i]; + for (i = 0; i < 3; i++) overlap[noverlap].pbc[i] = pbc[i]; + noverlap++; + } + } + + // recurse with 2 split boxes + + if (splitflag) { + ghost_box_drop(newbox1,pbc); + ghost_box_drop(newbox2,newpbc); + } +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::box_drop_grid(int *box, int proclower, int procupper, + int &np, int *plist) +{ + // end recursion when partition is a single proc + // add proclower to plist + + if (proclower == procupper) { + plist[np++] = proclower; + return; + } + + // drop box on each side of cut it extends beyond + // use < and >= criteria so does not include a box it only touches + // procmid = 1st processor in upper half of partition + // = location in tree that stores this cut + // cut = index of first grid cell in upper partition + // dim = 0,1,2 dimension of cut + + int procmid = proclower + (procupper - proclower) / 2 + 1; + int dim = rcbinfo[procmid].dim; + int cut = rcbinfo[procmid].cut; + + if (box[2*dim] < cut) box_drop_grid(box,proclower,procmid-1,np,plist); + if (box[2*dim+1] >= cut) box_drop_grid(box,procmid,procupper,np,plist); +} + +/* ---------------------------------------------------------------------- + check if all procs only need ghost info from adjacent procs + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent() +{ + if (layout == REGULAR) return ghost_adjacent_regular(); + return ghost_adjacent_tiled(); +} + +/* 
---------------------------------------------------------------------- + adjacent = 0 if a proc's ghost xyz lohi values exceed its subdomain size + return 0 if adjacent=0 for any proc, else 1 +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent_regular() +{ + adjacent = 1; + if (ghostxlo > inxhi-inxlo+1) adjacent = 0; + if (ghostxhi > inxhi-inxlo+1) adjacent = 0; + if (ghostylo > inyhi-inylo+1) adjacent = 0; + if (ghostyhi > inyhi-inylo+1) adjacent = 0; + if (ghostzlo > inzhi-inzlo+1) adjacent = 0; + if (ghostzhi > inzhi-inzlo+1) adjacent = 0; + + int adjacent_all; + MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); + return adjacent_all; +} + +/* ---------------------------------------------------------------------- + adjacent = 0 if a proc's received ghosts were flagged + as non-adjacent in setup_tiled() + return 0 if adjacent=0 for any proc, else 1 +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent_tiled() +{ + int adjacent_all; + MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); + return adjacent_all; +} + +/* ---------------------------------------------------------------------- + use swap list in forward order to acquire copy of all needed ghost grid pts +------------------------------------------------------------------------- */ + +void GridComm2::forward_comm(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + if (layout == REGULAR) + forward_comm_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + else + forward_comm_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::forward_comm_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; + + for (m = 0; m < nswap; m++) { + if (swap[m].sendproc == me) + kspace->pack_forward2(which,buf2,swap[m].npack,swap[m].packlist); + else + kspace->pack_forward2(which,buf1,swap[m].npack,swap[m].packlist); + + if (swap[m].sendproc != me) { + if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm,&request); + if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm); + if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + + kspace->unpack_forward2(which,buf2,swap[m].nunpack,swap[m].unpacklist); + } +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::forward_comm_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) +{ + int i,m,offset; + + char *buf2 = (char *) vbuf2; + + // post all receives + + for (m = 0; m < nrecv; m++) { + offset = nper * recv[m].offset * nbyte; + MPI_Irecv((void *) &buf2[offset],nper*recv[m].nunpack,datatype, + recv[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nsend; m++) { + kspace->pack_forward2(which,buf1,send[m].npack,send[m].packlist); + MPI_Send(buf1,nper*send[m].npack,datatype,send[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + kspace->pack_forward2(which,buf1,copy[m].npack,copy[m].packlist); + kspace->unpack_forward2(which,buf1,copy[m].nunpack,copy[m].unpacklist); + } + + // unpack all received data + + for (i = 0; i < nrecv; i++) { + 
MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); + offset = nper * recv[m].offset * nbyte; + kspace->unpack_forward2(which,(void *) &buf2[offset], + recv[m].nunpack,recv[m].unpacklist); + } +} + +/* ---------------------------------------------------------------------- + use swap list in reverse order to compute fully summed value + for each owned grid pt that some other proc has copy of as a ghost grid pt +------------------------------------------------------------------------- */ + +void GridComm2::reverse_comm(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + if (layout == REGULAR) + reverse_comm_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + else + reverse_comm_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::reverse_comm_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; + + for (m = nswap-1; m >= 0; m--) { + if (swap[m].recvproc == me) + kspace->pack_reverse2(which,buf2,swap[m].nunpack,swap[m].unpacklist); + else + kspace->pack_reverse2(which,buf1,swap[m].nunpack,swap[m].unpacklist); + + if (swap[m].recvproc != me) { + if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm,&request); + if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm); + if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + + kspace->unpack_reverse2(which,buf2,swap[m].npack,swap[m].packlist); + } +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::reverse_comm_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) +{ + int i,m,offset; + + char *buf2 = (char *) vbuf2; + + // post all receives + + for (m = 0; m < nsend; m++) { + offset = nper * send[m].offset * nbyte; + MPI_Irecv((void *) &buf2[offset],nper*send[m].npack,datatype, + send[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nrecv; m++) { + kspace->pack_reverse2(which,buf1,recv[m].nunpack,recv[m].unpacklist); + MPI_Send(buf1,nper*recv[m].nunpack,datatype,recv[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + kspace->pack_reverse2(which,buf1,copy[m].nunpack,copy[m].unpacklist); + kspace->unpack_reverse2(which,buf1,copy[m].npack,copy[m].packlist); + } + + // unpack all received data + + for (i = 0; i < nsend; i++) { + MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); + offset = nper * send[m].offset * nbyte; + kspace->unpack_reverse2(which,(void *) &buf2[offset], + send[m].npack,send[m].packlist); + } +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +void GridComm2::grow_swap() +{ + maxswap += SWAPDELTA; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"CommGrid:swap"); +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 
dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +void GridComm2::grow_overlap() +{ + maxoverlap += SWAPDELTA; + overlap = (Overlap *) + memory->srealloc(overlap,maxoverlap*sizeof(Overlap),"CommGrid:overlap"); +} + +/* ---------------------------------------------------------------------- + create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) + assume 3d array is allocated as (outxlo_max:outxhi_max,outylo_max:outyhi_max, + outzlo_max:outzhi_max) +------------------------------------------------------------------------- */ + +int GridComm2::indices(int *&list, + int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) +{ + int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); + memory->create(list,nmax,"CommGrid:indices"); + if (nmax == 0) return 0; + + int nx = (outxhi_max-outxlo_max+1); + int ny = (outyhi_max-outylo_max+1); + + int n = 0; + int ix,iy,iz; + for (iz = zlo; iz <= zhi; iz++) + for (iy = ylo; iy <= yhi; iy++) + for (ix = xlo; ix <= xhi; ix++) + list[n++] = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max); + + return nmax; +} diff --git a/src/KSPACE/gridcomm2.h b/src/KSPACE/gridcomm2.h new file mode 100644 index 0000000000..eeba990d2d --- /dev/null +++ b/src/KSPACE/gridcomm2.h @@ -0,0 +1,201 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_GRIDCOMM2_H +#define LMP_GRIDCOMM2_H + +#include "pointers.h" + +namespace LAMMPS_NS { + +class GridComm2 : protected Pointers { + public: + GridComm2(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + GridComm2(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + ~GridComm2(); + void setup(int &, int &); + int ghost_adjacent(); + void forward_comm(class KSpace *, int, int, int, void *, void *, MPI_Datatype); + void reverse_comm(class KSpace *, int, int, int, void *, void *, MPI_Datatype); + + private: + int me,nprocs; + int layout; // REGULAR or TILED + MPI_Comm gridcomm; + + // inputs from caller via constructor + + int nx,ny,nz; // size of global grid in all 3 dims + int inxlo,inxhi; // inclusive extent of my grid chunk + int inylo,inyhi; // 0 <= in <= N-1 + int inzlo,inzhi; + int outxlo,outxhi; // inclusive extent of my grid chunk plus + int outylo,outyhi; // ghost cells in all 6 directions + int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N + int outxlo_max,outxhi_max; // ?? 
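+                              // these _max bounds describe the caller's full
+                              // allocated grid array, which may be larger than
+                              // out xyz lohi (see 2nd constructor); they are
+                              // only used by indices() to compute offsets into
+                              // the caller's array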
+ int outylo_max,outyhi_max; + int outzlo_max,outzhi_max; + + // ------------------------------------------- + // internal variables for REGULAR layout + // ------------------------------------------- + + int procxlo,procxhi; // 6 neighbor procs that adjoin me + int procylo,procyhi; // not used for comm_style = tiled + int proczlo,proczhi; + + int ghostxlo,ghostxhi; // # of my owned grid planes needed + int ghostylo,ghostyhi; // by neighobr procs in each dir as their ghost planes + int ghostzlo,ghostzhi; + + // swap = exchange of owned and ghost grid cells between 2 procs, including self + + struct Swap { + int sendproc; // proc to send to for forward comm + int recvproc; // proc to recv from for forward comm + int npack; // # of datums to pack + int nunpack; // # of datums to unpack + int *packlist; // 3d array offsets to pack + int *unpacklist; // 3d array offsets to unpack + }; + + int nswap,maxswap; + Swap *swap; + + // ------------------------------------------- + // internal variables for TILED layout + // ------------------------------------------- + + int *overlap_procs; + MPI_Request *requests; + + // RCB tree of cut info + // each proc contributes one value, except proc 0 + + struct RCBinfo { + int dim; // 0,1,2 = which dim the cut is in + int cut; // grid index of lowest cell in upper half of cut + }; + + RCBinfo *rcbinfo; + + // overlap = a proc whose owned cells overlap with my extended ghost box + // includes overlaps across periodic boundaries, can also be self + + struct Overlap { + int proc; // proc whose owned cells overlap my ghost cells + int box[6]; // box that overlaps otherproc's owned cells + // this box is wholly contained within global grid + int pbc[3]; // PBC offsets to convert box to a portion of my ghost box + // my ghost box may extend beyond global grid + }; + + int noverlap,maxoverlap; + Overlap *overlap; + + // request = sent to each proc whose owned cells overlap my ghost cells + + struct Request { + int sender; // sending proc + int index; // index of overlap on sender + int box[6]; // box that overlaps receiver's owned cells + // wholly contained within global grid + }; + + Request *srequest,*rrequest; + + // response = reply from each proc whose owned cells overlap my ghost cells + + struct Response { + int index; // index of my overlap for the initial request + int box[6]; // box that overlaps responder's owned cells + // wholly contained within global grid + // has to unwrapped by PBC to map to my ghost cells + }; + + Response *sresponse,*rresponse; + + // send = proc to send a subset of my owned cells to, for forward comm + // for reverse comm, proc I receive ghost overlaps with my owned cells from + // offset used in reverse comm to recv a message in middle of a large buffer + + struct Send { + int proc; + int npack; + int *packlist; + int offset; + }; + + // recv = proc to recv a subset of my ghost cells from, for forward comm + // for reverse comm, proc I send a subset of my ghost cells to + // offset used in forward comm to recv a message in middle of a large buffer + + struct Recv { + int proc; + int nunpack; + int *unpacklist; + int offset; + }; + + int adjacent; // 0 on a proc who receives ghosts from a non-neighbor proc + + // copy = subset of my owned cells to copy into subset of my ghost cells + // that describes forward comm, for reverse comm it is the opposite + + struct Copy { + int npack; + int nunpack; + int *packlist; + int *unpacklist; + }; + + int nsend,nrecv,ncopy; + Send *send; + Recv *recv; + Copy *copy; + + // 
------------------------------------------- + // internal methods + // ------------------------------------------- + + void setup_regular(int &, int &); + void setup_tiled(int &, int &); + void ghost_box_drop(int *, int *); + void box_drop_grid(int *, int, int, int &, int *); + + int ghost_adjacent_regular(); + int ghost_adjacent_tiled(); + + void forward_comm_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void forward_comm_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + + void grow_swap(); + void grow_overlap(); + + int indices(int *&, int, int, int, int, int, int); +}; + +} + +#endif diff --git a/src/KSPACE/pppm2.cpp b/src/KSPACE/pppm2.cpp new file mode 100644 index 0000000000..19baa13f16 --- /dev/null +++ b/src/KSPACE/pppm2.cpp @@ -0,0 +1,3524 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL) + per-atom energy/virial & group/group energy/force added by Stan Moore (BYU) + analytic diff (2 FFT) option added by Rolf Isele-Holder (Aachen University) + triclinic added by Stan Moore (SNL) +------------------------------------------------------------------------- */ + +#include "pppm2.h" +#include +#include +#include +#include +#include "atom.h" +#include "comm.h" +#include "gridcomm2.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include "utils.h" +#include "fmt/format.h" + +#include "math_const.h" +#include "math_special.h" + +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecial; + +#define MAXORDER 7 +#define OFFSET 16384 +#define LARGE 10000.0 +#define SMALL 0.00001 +#define EPS_HOC 1.0e-7 + +enum{REVERSE_RHO}; +enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 +#define ONEF 1.0 +#endif + +/* ---------------------------------------------------------------------- */ + +PPPM2::PPPM2(LAMMPS *lmp) : KSpace(lmp), + factors(NULL), density_brick(NULL), vdx_brick(NULL), vdy_brick(NULL), vdz_brick(NULL), + u_brick(NULL), v0_brick(NULL), v1_brick(NULL), v2_brick(NULL), v3_brick(NULL), + v4_brick(NULL), v5_brick(NULL), greensfn(NULL), vg(NULL), fkx(NULL), fky(NULL), + fkz(NULL), density_fft(NULL), work1(NULL), work2(NULL), gf_b(NULL), rho1d(NULL), + rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), + sf_precoeff1(NULL), sf_precoeff2(NULL), sf_precoeff3(NULL), + sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL), + acons(NULL), density_A_brick(NULL), 
density_B_brick(NULL), density_A_fft(NULL), + density_B_fft(NULL), fft1(NULL), fft2(NULL), remap(NULL), gc(NULL), + gc_buf1(NULL), gc_buf2(NULL), part2grid(NULL), boxlo(NULL) +{ + peratom_allocate_flag = 0; + group_allocate_flag = 0; + + pppmflag = 1; + group_group_enable = 1; + triclinic = domain->triclinic; + + nfactors = 3; + factors = new int[nfactors]; + factors[0] = 2; + factors[1] = 3; + factors[2] = 5; + + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + + nfft_both = 0; + nxhi_in = nxlo_in = nxhi_out = nxlo_out = 0; + nyhi_in = nylo_in = nyhi_out = nylo_out = 0; + nzhi_in = nzlo_in = nzhi_out = nzlo_out = 0; + + density_brick = vdx_brick = vdy_brick = vdz_brick = NULL; + density_fft = NULL; + u_brick = NULL; + v0_brick = v1_brick = v2_brick = v3_brick = v4_brick = v5_brick = NULL; + greensfn = NULL; + work1 = work2 = NULL; + vg = NULL; + fkx = fky = fkz = NULL; + + sf_precoeff1 = sf_precoeff2 = sf_precoeff3 = + sf_precoeff4 = sf_precoeff5 = sf_precoeff6 = NULL; + + density_A_brick = density_B_brick = NULL; + density_A_fft = density_B_fft = NULL; + + gf_b = NULL; + rho1d = rho_coeff = drho1d = drho_coeff = NULL; + + fft1 = fft2 = NULL; + remap = NULL; + gc = NULL; + gc_buf1 = gc_buf2 = NULL; + + nmax = 0; + part2grid = NULL; + + // define acons coefficients for estimation of kspace errors + // see JCP 109, pg 7698 for derivation of coefficients + // higher order coefficients may be computed if needed + + memory->create(acons,8,7,"pppm:acons"); + acons[1][0] = 2.0 / 3.0; + acons[2][0] = 1.0 / 50.0; + acons[2][1] = 5.0 / 294.0; + acons[3][0] = 1.0 / 588.0; + acons[3][1] = 7.0 / 1440.0; + acons[3][2] = 21.0 / 3872.0; + acons[4][0] = 1.0 / 4320.0; + acons[4][1] = 3.0 / 1936.0; + acons[4][2] = 7601.0 / 2271360.0; + acons[4][3] = 143.0 / 28800.0; + acons[5][0] = 1.0 / 23232.0; + acons[5][1] = 7601.0 / 13628160.0; + acons[5][2] = 143.0 / 69120.0; + acons[5][3] = 517231.0 / 106536960.0; + acons[5][4] = 106640677.0 / 11737571328.0; + acons[6][0] = 691.0 / 68140800.0; + acons[6][1] = 13.0 / 57600.0; + acons[6][2] = 47021.0 / 35512320.0; + acons[6][3] = 9694607.0 / 2095994880.0; + acons[6][4] = 733191589.0 / 59609088000.0; + acons[6][5] = 326190917.0 / 11700633600.0; + acons[7][0] = 1.0 / 345600.0; + acons[7][1] = 3617.0 / 35512320.0; + acons[7][2] = 745739.0 / 838397952.0; + acons[7][3] = 56399353.0 / 12773376000.0; + acons[7][4] = 25091609.0 / 1560084480.0; + acons[7][5] = 1755948832039.0 / 36229939200000.0; + acons[7][6] = 4887769399.0 / 37838389248.0; +} + +/* ---------------------------------------------------------------------- */ + +void PPPM2::settings(int narg, char **arg) +{ + if (narg < 1) error->all(FLERR,"Illegal kspace_style pppm command"); + accuracy_relative = fabs(force->numeric(FLERR,arg[0])); +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +PPPM2::~PPPM2() +{ + if (copymode) return; + + delete [] factors; + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if (group_allocate_flag) deallocate_groups(); + memory->destroy(part2grid); + memory->destroy(acons); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +void PPPM2::init() +{ + if (me == 0) utils::logmesg(lmp,"PPPM initialization ...\n"); + + // error check + + triclinic_check(); + + if (triclinic != domain->triclinic) + 
error->all(FLERR,"Must redefine kspace_style after changing to triclinic box"); + + if (domain->triclinic && differentiation_flag == 1) + error->all(FLERR,"Cannot (yet) use PPPM with triclinic box " + "and kspace_modify diff ad"); + if (domain->triclinic && slabflag) + error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and " + "slab correction"); + if (domain->dimension == 2) + error->all(FLERR,"Cannot use PPPM with 2d simulation"); + + if (!atom->q_flag) + error->all(FLERR,"Kspace style requires atom attribute q"); + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all(FLERR,"Cannot use non-periodic boundaries with PPPM"); + if (slabflag) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all(FLERR,"Incorrect boundaries with slab PPPM"); + } + + if (order < 2 || order > MAXORDER) + error->all(FLERR,fmt::format("PPPM order cannot be < 2 or > {}",MAXORDER)); + + // compute two charge force + + two_charge(); + + // extract short-range Coulombic cutoff from pair style + + triclinic = domain->triclinic; + pair_check(); + + int itmp = 0; + double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); + if (p_cutoff == NULL) + error->all(FLERR,"KSpace style is incompatible with Pair style"); + cutoff = *p_cutoff; + + // if kspace is TIP4P, extract TIP4P params from pair style + // bond/angle are not yet init(), so insure equilibrium request is valid + + qdist = 0.0; + + if (tip4pflag) { + if (me == 0) utils::logmesg(lmp," extracting TIP4P info from pair style\n"); + + double *p_qdist = (double *) force->pair->extract("qdist",itmp); + int *p_typeO = (int *) force->pair->extract("typeO",itmp); + int *p_typeH = (int *) force->pair->extract("typeH",itmp); + int *p_typeA = (int *) force->pair->extract("typeA",itmp); + int *p_typeB = (int *) force->pair->extract("typeB",itmp); + if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB) + error->all(FLERR,"Pair style is incompatible with TIP4P KSpace style"); + qdist = *p_qdist; + typeO = *p_typeO; + typeH = *p_typeH; + int typeA = *p_typeA; + int typeB = *p_typeB; + + if (force->angle == NULL || force->bond == NULL || + force->angle->setflag == NULL || force->bond->setflag == NULL) + error->all(FLERR,"Bond and angle potentials must be defined for TIP4P"); + if (typeA < 1 || typeA > atom->nangletypes || + force->angle->setflag[typeA] == 0) + error->all(FLERR,"Bad TIP4P angle type for PPPM/TIP4P"); + if (typeB < 1 || typeB > atom->nbondtypes || + force->bond->setflag[typeB] == 0) + error->all(FLERR,"Bad TIP4P bond type for PPPM/TIP4P"); + double theta = force->angle->equilibrium_angle(typeA); + double blen = force->bond->equilibrium_distance(typeB); + alpha = qdist / (cos(0.5*theta) * blen); + } + + // compute qsum & qsqsum and warn if not charge-neutral + + scale = 1.0; + qqrd2e = force->qqrd2e; + qsum_qsq(); + natoms_original = atom->natoms; + + // set accuracy (force units) from accuracy_relative or accuracy_absolute + + if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute; + else accuracy = accuracy_relative * two_charge_force; + + // free all arrays previously allocated + + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if (group_allocate_flag) deallocate_groups(); + + // setup FFT grid resolution and g_ewald + // normally one iteration thru while loop is all that is required + // if grid stencil does not extend beyond neighbor proc + // or overlap is allowed, then done + // else reduce order and try again + + int 
(*procneigh)[2] = comm->procneigh; + + GridComm2 *gctmp = NULL; + int iteration = 0; + + while (order >= minorder) { + if (iteration && me == 0) + error->warning(FLERR,"Reducing PPPM order b/c stencil extends " + "beyond nearest neighbor processor"); + + if (stagger_flag && !differentiation_flag) compute_gf_denom(); + set_grid_global(); + set_grid_local(); + if (overlap_allowed) break; + + gctmp = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (gctmp->ghost_adjacent()) break; + delete gctmp; + + order--; + iteration++; + } + + if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order"); + if (!overlap_allowed && !gctmp->ghost_adjacent()) + error->all(FLERR,"PPPM grid stencil extends " + "beyond nearest neighbor processor"); + if (gctmp) delete gctmp; + + // adjust g_ewald + + if (!gewaldflag) adjust_gewald(); + + // calculate the final accuracy + + double estimated_accuracy = final_accuracy(); + + // print stats + + int ngrid_max,nfft_both_max; + MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world); + MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); + + if (me == 0) { + std::string mesg = fmt::format(" G vector (1/distance) = {:.8g}\n",g_ewald); + mesg += fmt::format(" grid = {} {} {}\n",nx_pppm,ny_pppm,nz_pppm); + mesg += fmt::format(" stencil order = {}\n",order); + mesg += fmt::format(" estimated absolute RMS force accuracy = {:.8g}\n", + estimated_accuracy); + mesg += fmt::format(" estimated relative force accuracy = {:.8g}\n", + estimated_accuracy/two_charge_force); + mesg += " using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"; + mesg += fmt::format(" 3d grid and FFT values/proc = {} {}\n", + ngrid_max,nfft_both_max); + utils::logmesg(lmp,mesg); + } + + // allocate K-space dependent memory + // don't invoke allocate peratom() or group(), will be allocated when needed + + allocate(); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + if (differentiation_flag == 1) compute_sf_precoeff(); + compute_rho_coeff(); +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed +------------------------------------------------------------------------- */ + +void PPPM2::setup() +{ + if (triclinic) { + setup_triclinic(); + return; + } + + // perform some checks to avoid illegal boundaries with read_data + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all(FLERR,"Cannot use non-periodic boundaries with PPPM"); + if (slabflag) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all(FLERR,"Incorrect boundaries with slab PPPM"); + } + + int i,j,k,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + delxinv = nx_pppm/xprd; + delyinv = ny_pppm/yprd; + delzinv = nz_pppm/zprd_slab; + + delvolinv = delxinv*delyinv*delzinv; + + double unitkx = (MY_2PI/xprd); + double unitky = (MY_2PI/yprd); + double unitkz = 
(MY_2PI/zprd_slab); + + // fkx,fky,fkz for my FFT grid pts + + double per; + + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per = i - nx_pppm*(2*i/nx_pppm); + fkx[i] = unitkx*per; + } + + for (i = nylo_fft; i <= nyhi_fft; i++) { + per = i - ny_pppm*(2*i/ny_pppm); + fky[i] = unitky*per; + } + + for (i = nzlo_fft; i <= nzhi_fft; i++) { + per = i - nz_pppm*(2*i/nz_pppm); + fkz[i] = unitkz*per; + } + + // virial coefficients + + double sqk,vterm; + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + for (j = nylo_fft; j <= nyhi_fft; j++) { + for (i = nxlo_fft; i <= nxhi_fft; i++) { + sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; + vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; + vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; + vg[n][3] = vterm*fkx[i]*fky[j]; + vg[n][4] = vterm*fkx[i]*fkz[k]; + vg[n][5] = vterm*fky[j]*fkz[k]; + } + n++; + } + } + } + + if (differentiation_flag == 1) compute_gf_ad(); + else compute_gf_ik(); +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed + for a triclinic system +------------------------------------------------------------------------- */ + +void PPPM2::setup_triclinic() +{ + int i,j,k,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + prd = domain->prd; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + // use lamda (0-1) coordinates + + delxinv = nx_pppm; + delyinv = ny_pppm; + delzinv = nz_pppm; + delvolinv = delxinv*delyinv*delzinv/volume; + + // fkx,fky,fkz for my FFT grid pts + + double per_i,per_j,per_k; + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + per_k = k - nz_pppm*(2*k/nz_pppm); + for (j = nylo_fft; j <= nyhi_fft; j++) { + per_j = j - ny_pppm*(2*j/ny_pppm); + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per_i = i - nx_pppm*(2*i/nx_pppm); + + double unitk_lamda[3]; + unitk_lamda[0] = 2.0*MY_PI*per_i; + unitk_lamda[1] = 2.0*MY_PI*per_j; + unitk_lamda[2] = 2.0*MY_PI*per_k; + x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); + fkx[n] = unitk_lamda[0]; + fky[n] = unitk_lamda[1]; + fkz[n] = unitk_lamda[2]; + n++; + } + } + } + + // virial coefficients + + double sqk,vterm; + + for (n = 0; n < nfft; n++) { + sqk = fkx[n]*fkx[n] + fky[n]*fky[n] + fkz[n]*fkz[n]; + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + vg[n][0] = 1.0 + vterm*fkx[n]*fkx[n]; + vg[n][1] = 1.0 + vterm*fky[n]*fky[n]; + vg[n][2] = 1.0 + vterm*fkz[n]*fkz[n]; + vg[n][3] = vterm*fkx[n]*fky[n]; + vg[n][4] = vterm*fkx[n]*fkz[n]; + vg[n][5] = vterm*fky[n]*fkz[n]; + } + } + + compute_gf_ik_triclinic(); +} + +/* ---------------------------------------------------------------------- + reset local grid arrays and communication stencils + called by fix balance b/c it changed sizes of processor sub-domains +------------------------------------------------------------------------- */ + +void PPPM2::setup_grid() +{ + // free all arrays previously allocated + + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if 
(group_allocate_flag) deallocate_groups(); + + // reset portion of global grid that each proc owns + + set_grid_local(); + + // reallocate K-space dependent memory + // check if grid communication is now overlapping if not allowed + // don't invoke allocate peratom() or group(), will be allocated when needed + + allocate(); + + if (!overlap_allowed && !gc->ghost_adjacent()) + error->all(FLERR,"PPPM grid stencil extends " + "beyond nearest neighbor processor"); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + if (differentiation_flag == 1) compute_sf_precoeff(); + compute_rho_coeff(); + + // pre-compute volume-dependent coeffs for portion of grid I now own + + setup(); +} + +/* ---------------------------------------------------------------------- + compute the PPPM long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPM2::compute(int eflag, int vflag) +{ + int i,j; + + // set energy/virial flags + // invoke allocate_peratom() if needed for first time + + ev_init(eflag,vflag); + + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); + + // if atom count has changed, update qsum and qsqsum + + if (atom->natoms != natoms_original) { + qsum_qsq(); + natoms_original = atom->natoms; + } + + // return if there are no charges + + if (qsqsum == 0.0) return; + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + // extend size of per-atom arrays if necessary + + if (atom->nmax > nmax) { + memory->destroy(part2grid); + nmax = atom->nmax; + memory->create(part2grid,nmax,3,"pppm:part2grid"); + } + + // find grid points for all my particles + // map my particle charge onto my local 3d density grid + + particle_map(); + make_rho(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + // also performs per-atom calculations via poisson_peratom() + + poisson(); + + // all procs communicate E-field values + // to fill ghost cells surrounding their 3d bricks + + if (differentiation_flag == 1) + gc->forward_comm(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + + // extra per-atom energy/virial communication + + if (evflag_atom) { + if (differentiation_flag == 1 && vflag_atom) + gc->forward_comm(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else if (differentiation_flag == 0) + gc->forward_comm(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + } + + // calculate the force on my particles + + fieldforce(); + + // extra per-atom energy/virial communication + + if (evflag_atom) fieldforce_peratom(); + + // sum global energy across procs and add in volume-dependent term + + const double qscale = qqrd2e * scale; + + if (eflag_global) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 
0.5*volume; + energy -= g_ewald*qsqsum/MY_PIS + + MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qscale; + } + + // sum global virial across procs + + if (vflag_global) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + } + + // per-atom energy/virial + // energy includes self-energy correction + // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms + + if (evflag_atom) { + double *q = atom->q; + int nlocal = atom->nlocal; + int ntotal = nlocal; + if (tip4pflag) ntotal += atom->nghost; + + if (eflag_atom) { + for (i = 0; i < nlocal; i++) { + eatom[i] *= 0.5; + eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / + (g_ewald*g_ewald*volume); + eatom[i] *= qscale; + } + for (i = nlocal; i < ntotal; i++) eatom[i] *= 0.5*qscale; + } + + if (vflag_atom) { + for (i = 0; i < ntotal; i++) + for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale; + } + } + + // 2d slab correction + + if (slabflag == 1) slabcorr(); + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::allocate() +{ + memory->create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick"); + + memory->create(density_fft,nfft_both,"pppm:density_fft"); + memory->create(greensfn,nfft_both,"pppm:greensfn"); + memory->create(work1,2*nfft_both,"pppm:work1"); + memory->create(work2,2*nfft_both,"pppm:work2"); + memory->create(vg,nfft_both,6,"pppm:vg"); + + if (triclinic == 0) { + memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx"); + memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky"); + memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz"); + } else { + memory->create(fkx,nfft_both,"pppm:fkx"); + memory->create(fky,nfft_both,"pppm:fky"); + memory->create(fkz,nfft_both,"pppm:fkz"); + } + + if (differentiation_flag == 1) { + memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:u_brick"); + + memory->create(sf_precoeff1,nfft_both,"pppm:sf_precoeff1"); + memory->create(sf_precoeff2,nfft_both,"pppm:sf_precoeff2"); + memory->create(sf_precoeff3,nfft_both,"pppm:sf_precoeff3"); + memory->create(sf_precoeff4,nfft_both,"pppm:sf_precoeff4"); + memory->create(sf_precoeff5,nfft_both,"pppm:sf_precoeff5"); + memory->create(sf_precoeff6,nfft_both,"pppm:sf_precoeff6"); + + } else { + memory->create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdx_brick"); + memory->create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdy_brick"); + memory->create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdz_brick"); + } + + // summation coeffs + + order_allocated = order; + if (!stagger_flag) memory->create(gf_b,order,"pppm:gf_b"); + memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d"); + memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm:drho1d"); + memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff"); + memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2, + "pppm:drho_coeff"); + + // create 2 FFTs and a Remap + // 1st FFT keeps data in FFT decomposition + // 2nd FFT returns data in 3d brick decomposition + 
// remap takes data from 3d brick to FFT decomposition + + int tmp; + + fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp,collective_flag); + + fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp,collective_flag); + + remap = new Remap(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,FFT_PRECISION,collective_flag); + + // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods + + gc = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + gc->setup(ngc_buf1,ngc_buf2); + + if (differentiation_flag) npergrid = 1; + else npergrid = 3; + + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); +} + +/* ---------------------------------------------------------------------- + deallocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::deallocate() +{ + memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); + + if (differentiation_flag == 1) { + memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy(sf_precoeff1); + memory->destroy(sf_precoeff2); + memory->destroy(sf_precoeff3); + memory->destroy(sf_precoeff4); + memory->destroy(sf_precoeff5); + memory->destroy(sf_precoeff6); + } else { + memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); + } + + memory->destroy(density_fft); + memory->destroy(greensfn); + memory->destroy(work1); + memory->destroy(work2); + memory->destroy(vg); + + if (triclinic == 0) { + memory->destroy1d_offset(fkx,nxlo_fft); + memory->destroy1d_offset(fky,nylo_fft); + memory->destroy1d_offset(fkz,nzlo_fft); + } else { + memory->destroy(fkx); + memory->destroy(fky); + memory->destroy(fkz); + } + + memory->destroy(gf_b); + if (stagger_flag) gf_b = NULL; + memory->destroy2d_offset(rho1d,-order_allocated/2); + memory->destroy2d_offset(drho1d,-order_allocated/2); + memory->destroy2d_offset(rho_coeff,(1-order_allocated)/2); + memory->destroy2d_offset(drho_coeff,(1-order_allocated)/2); + + delete fft1; + delete fft2; + delete remap; + delete gc; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); +} + +/* ---------------------------------------------------------------------- + allocate per-atom memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::allocate_peratom() +{ + peratom_allocate_flag = 1; + + if (differentiation_flag != 1) + memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:u_brick"); + + memory->create3d_offset(v0_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v0_brick"); + + memory->create3d_offset(v1_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v1_brick"); + memory->create3d_offset(v2_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + 
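/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the ghost-cell
   buffers created just above are sized as npergrid*ngc_buf, where
   npergrid is the number of FFT_SCALAR values per grid point; the
   per-atom path in allocate_peratom() below enlarges them to 6 or 7.
   Helper names are hypothetical
------------------------------------------------------------------------- */

#include <cstddef>

// values communicated per ghost grid cell in the forward pass:
//   ik differentiation : 3 (Ex,Ey,Ez bricks)      ad : 1 (u brick)
//   per-atom ik        : 7 (u + 6 virial bricks)  per-atom ad : 6
int values_per_gridpoint(bool ad_differentiation, bool peratom)
{
  if (!peratom) return ad_differentiation ? 1 : 3;
  return ad_differentiation ? 6 : 7;
}

std::size_t ghost_buffer_size(int npergrid, int ngc_buf)
{
  return static_cast<std::size_t>(npergrid) * ngc_buf;   // in FFT_SCALARs
}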
nxlo_out,nxhi_out,"pppm:v2_brick"); + memory->create3d_offset(v3_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v3_brick"); + memory->create3d_offset(v4_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v4_brick"); + memory->create3d_offset(v5_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v5_brick"); + + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 + + if (differentiation_flag) npergrid = 6; + else npergrid = 7; + + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); +} + +/* ---------------------------------------------------------------------- + deallocate per-atom memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::deallocate_peratom() +{ + peratom_allocate_flag = 0; + + memory->destroy3d_offset(v0_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v1_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v2_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v3_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v4_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v5_brick,nzlo_out,nylo_out,nxlo_out); + + if (differentiation_flag != 1) + memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); +} + +/* ---------------------------------------------------------------------- + set global size of PPPM grid = nx,ny,nz_pppm + used for charge accumulation, FFTs, and electric field interpolation +------------------------------------------------------------------------- */ + +void PPPM2::set_grid_global() +{ + // use xprd,yprd,zprd (even if triclinic, and then scale later) + // adjust z dimension for 2d slab PPPM + // 3d PPPM just uses zprd since slab_volfactor = 1.0 + + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + + // make initial g_ewald estimate + // based on desired accuracy and real space cutoff + // fluid-occupied volume used to estimate real-space error + // zprd used rather than zprd_slab + + double h; + bigint natoms = atom->natoms; + + if (!gewaldflag) { + if (accuracy <= 0.0) + error->all(FLERR,"KSpace accuracy must be > 0"); + if (q2 == 0.0) + error->all(FLERR,"Must use kspace_modify gewald for uncharged system"); + g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2); + if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff; + else g_ewald = sqrt(-log(g_ewald)) / cutoff; + } + + // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy + // nz_pppm uses extended zprd_slab instead of zprd + // reduce it until accuracy target is met + + if (!gridflag) { + + if (differentiation_flag == 1 || stagger_flag) { + + h = h_x = h_y = h_z = 4.0/g_ewald; + int count = 0; + while (1) { + + // set grid dimensions + + nx_pppm = static_cast (xprd/h_x); + ny_pppm = static_cast (yprd/h_y); + nz_pppm = static_cast (zprd_slab/h_z); + + if (nx_pppm <= 1) nx_pppm = 2; + if (ny_pppm <= 1) ny_pppm = 2; + if (nz_pppm <= 1) nz_pppm = 2; + + // estimate Kspace force error + + double df_kspace = compute_df_kspace(); + + // break loop if the accuracy has been reached or + // too many loops have been performed + + count++; + if (df_kspace <= accuracy) break; + + if (count > 500) error->all(FLERR, 
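/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the while(1)
   loop in set_grid_global() surrounding this point shrinks the grid
   spacing by 5% per pass until the estimated k-space error meets the
   accuracy target, giving up after 500 tries; err() below is a
   hypothetical stand-in for compute_df_kspace()
------------------------------------------------------------------------- */

#include <stdexcept>

// shrink spacing h until err(h) <= accuracy, mirroring the ad/stagger
// branch of the grid-size search above
double refine_spacing(double h, double accuracy,
                      double (*err)(double), int maxiter = 500)
{
  for (int it = 0; it < maxiter; ++it) {
    if (err(h) <= accuracy) return h;
    h *= 0.95;                       // same 5% reduction as above
  }
  throw std::runtime_error("could not compute grid size");
}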
"Could not compute grid size"); + h *= 0.95; + h_x = h_y = h_z = h; + } + + } else { + + double err; + h_x = h_y = h_z = 1.0/g_ewald; + + nx_pppm = static_cast (xprd/h_x) + 1; + ny_pppm = static_cast (yprd/h_y) + 1; + nz_pppm = static_cast (zprd_slab/h_z) + 1; + + err = estimate_ik_error(h_x,xprd,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_x,xprd,natoms); + nx_pppm++; + h_x = xprd/nx_pppm; + } + + err = estimate_ik_error(h_y,yprd,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_y,yprd,natoms); + ny_pppm++; + h_y = yprd/ny_pppm; + } + + err = estimate_ik_error(h_z,zprd_slab,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_z,zprd_slab,natoms); + nz_pppm++; + h_z = zprd_slab/nz_pppm; + } + } + + // scale grid for triclinic skew + + if (triclinic) { + double tmp[3]; + tmp[0] = nx_pppm/xprd; + tmp[1] = ny_pppm/yprd; + tmp[2] = nz_pppm/zprd; + lamda2xT(&tmp[0],&tmp[0]); + nx_pppm = static_cast(tmp[0]) + 1; + ny_pppm = static_cast(tmp[1]) + 1; + nz_pppm = static_cast(tmp[2]) + 1; + } + } + + // boost grid size until it is factorable + + while (!factorable(nx_pppm)) nx_pppm++; + while (!factorable(ny_pppm)) ny_pppm++; + while (!factorable(nz_pppm)) nz_pppm++; + + if (triclinic == 0) { + h_x = xprd/nx_pppm; + h_y = yprd/ny_pppm; + h_z = zprd_slab/nz_pppm; + } else { + double tmp[3]; + tmp[0] = nx_pppm; + tmp[1] = ny_pppm; + tmp[2] = nz_pppm; + x2lamdaT(&tmp[0],&tmp[0]); + h_x = 1.0/tmp[0]; + h_y = 1.0/tmp[1]; + h_z = 1.0/tmp[2]; + } + + if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET) + error->all(FLERR,"PPPM grid is too large"); +} + +/* ---------------------------------------------------------------------- + check if all factors of n are in list of factors + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +int PPPM2::factorable(int n) +{ + int i; + + while (n > 1) { + for (i = 0; i < nfactors; i++) { + if (n % factors[i] == 0) { + n /= factors[i]; + break; + } + } + if (i == nfactors) return 0; + } + + return 1; +} + +/* ---------------------------------------------------------------------- + compute estimated kspace force error +------------------------------------------------------------------------- */ + +double PPPM2::compute_df_kspace() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + bigint natoms = atom->natoms; + double df_kspace = 0.0; + if (differentiation_flag == 1 || stagger_flag) { + double qopt = compute_qopt(); + df_kspace = sqrt(qopt/natoms)*q2/(xprd*yprd*zprd_slab); + } else { + double lprx = estimate_ik_error(h_x,xprd,natoms); + double lpry = estimate_ik_error(h_y,yprd,natoms); + double lprz = estimate_ik_error(h_z,zprd_slab,natoms); + df_kspace = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + } + return df_kspace; +} + +/* ---------------------------------------------------------------------- + compute qopt +------------------------------------------------------------------------- */ + +double PPPM2::compute_qopt() +{ + int k,l,m,nx,ny,nz; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double u1,u2,sqk; + double sum1,sum2,sum3,sum4,dot2; + + double *prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = 
(MY_2PI/zprd_slab); + + const int twoorder = 2*order; + + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; + int nxy_pppm = nx_pppm * ny_pppm; + + double qopt = 0.0; + + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm; + l = (i/nx_pppm) % ny_pppm; + m = i / nxy_pppm; + + const int kper = k - nx_pppm*(2*k/nx_pppm); + const int lper = l - ny_pppm*(2*l/ny_pppm); + const int mper = m - nz_pppm*(2*m/nz_pppm); + + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + if (sqk == 0.0) continue; + + sum1 = sum2 = sum3 = sum4 = 0.0; + + for (nx = -2; nx <= 2; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + qx *= qx; + + for (ny = -2; ny <= 2; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + qy *= qy; + + for (nz = -2; nz <= 2; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + qz *= qz; + + dot2 = qx+qy+qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + + sum1 += u1*u1/dot2*MY_4PI*MY_4PI; + sum2 += u1 * u2 * MY_4PI; + sum3 += u2; + sum4 += dot2*u2; + } + } + } + + sum2 *= sum2; + qopt += sum1 - sum2/(sum3*sum4); + } + + // sum qopt over all procs + + double qopt_all; + MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); + return qopt_all; +} + +/* ---------------------------------------------------------------------- + estimate kspace force error for ik method +------------------------------------------------------------------------- */ + +double PPPM2::estimate_ik_error(double h, double prd, bigint natoms) +{ + double sum = 0.0; + if (natoms == 0) return 0.0; + for (int m = 0; m < order; m++) + sum += acons[order][m] * pow(h*g_ewald,2.0*m); + double value = q2 * pow(h*g_ewald,(double)order) * + sqrt(g_ewald*prd*sqrt(MY_2PI)*sum/natoms) / (prd*prd); + + return value; +} + +/* ---------------------------------------------------------------------- + adjust the g_ewald parameter to near its optimal value + using a Newton-Raphson solver +------------------------------------------------------------------------- */ + +void PPPM2::adjust_gewald() +{ + double dx; + + for (int i = 0; i < LARGE; i++) { + dx = newton_raphson_f() / derivf(); + g_ewald -= dx; + if (fabs(newton_raphson_f()) < SMALL) return; + } + error->all(FLERR, "Could not compute g_ewald"); +} + +/* ---------------------------------------------------------------------- + calculate f(x) using Newton-Raphson solver +------------------------------------------------------------------------- */ + +double PPPM2::newton_raphson_f() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atom->natoms; + + double df_rspace = 2.0*q2*exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd); + + double df_kspace = compute_df_kspace(); + + return df_rspace - df_kspace; +} + +/* ---------------------------------------------------------------------- + calculate numerical derivative f'(x) using forward difference + [f(x + h) - f(x)] / h +------------------------------------------------------------------------- */ + +double PPPM2::derivf() +{ + double h = 0.000001; //Derivative step-size + double df,f1,f2,g_ewald_old; + + f1 = newton_raphson_f(); + g_ewald_old = 
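/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): adjust_gewald()
   above iterates g_ewald -= f/f' with f = (real-space error) -
   (k-space error) and f' taken from the forward difference in derivf(),
   which continues just below; the generic solver here uses hypothetical
   names, step size, and tolerance
------------------------------------------------------------------------- */

#include <cmath>
#include <functional>
#include <stdexcept>

// 1d Newton iteration with a forward-difference derivative,
// x <- x - f(x) / ((f(x + h) - f(x)) / h)
double newton_forward_difference(std::function<double(double)> f, double x,
                                 double fd_step = 1.0e-6,
                                 double tol = 1.0e-8, int maxiter = 100)
{
  for (int it = 0; it < maxiter; ++it) {
    double f0 = f(x);
    if (std::fabs(f0) < tol) return x;
    double dfdx = (f(x + fd_step) - f0) / fd_step;
    x -= f0 / dfdx;
  }
  throw std::runtime_error("could not converge");
}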
g_ewald; + g_ewald += h; + f2 = newton_raphson_f(); + g_ewald = g_ewald_old; + df = (f2 - f1)/h; + + return df; +} + +/* ---------------------------------------------------------------------- + calculate the final estimate of the accuracy +------------------------------------------------------------------------- */ + +double PPPM2::final_accuracy() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atom->natoms; + if (natoms == 0) natoms = 1; // avoid division by zero + + double df_kspace = compute_df_kspace(); + double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd); + double df_rspace = 2.0 * q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff); + double df_table = estimate_table_accuracy(q2_over_sqrt,df_rspace); + double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace + + df_table*df_table); + + return estimated_accuracy; +} + +/* ---------------------------------------------------------------------- + set local subset of PPPM/FFT grid that I own + n xyz lo/hi in = 3d brick that I own (inclusive) + n xyz lo/hi out = 3d brick + ghost cells in 6 directions (inclusive) + n xyz lo/hi fft = FFT columns that I own (all of x dim, 2d decomp in yz) +------------------------------------------------------------------------- */ + +void PPPM2::set_grid_local() +{ + // global indices of PPPM grid range from 0 to N-1 + // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of + // global PPPM grid that I own without ghost cells + // for slab PPPM, assign z grid as if it were not extended + // both non-tiled and tiled proc layouts use 0-1 fractional sumdomain info + + if (comm->layout != Comm::LAYOUT_TILED) { + nxlo_in = static_cast (comm->xsplit[comm->myloc[0]] * nx_pppm); + nxhi_in = static_cast (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1; + + nylo_in = static_cast (comm->ysplit[comm->myloc[1]] * ny_pppm); + nyhi_in = static_cast (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1; + + nzlo_in = static_cast + (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor); + nzhi_in = static_cast + (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1; + + } else { + nxlo_in = static_cast (comm->mysplit[0][0] * nx_pppm); + nxhi_in = static_cast (comm->mysplit[0][1] * nx_pppm) - 1; + + nylo_in = static_cast (comm->mysplit[1][0] * ny_pppm); + nyhi_in = static_cast (comm->mysplit[1][1] * ny_pppm) - 1; + + nzlo_in = static_cast (comm->mysplit[2][0] * nz_pppm/slab_volfactor); + nzhi_in = static_cast (comm->mysplit[2][1] * nz_pppm/slab_volfactor) - 1; + } + + // nlower,nupper = stencil size for mapping particles to PPPM grid + + nlower = -(order-1)/2; + nupper = order/2; + + // shift values for particle <-> grid mapping + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + if (order % 2) shift = OFFSET + 0.5; + else shift = OFFSET; + if (order % 2) shiftone = 0.0; + else shiftone = 0.5; + + // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of + // global PPPM grid that my particles can contribute charge to + // effectively nlo_in,nhi_in + ghost cells + // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest + // position a particle in my box can be at + // dist[3] = particle position bound = subbox + skin/2.0 + qdist + // qdist = offset due to TIP4P fictitious charge + // convert to triclinic if necessary + // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping + // for slab PPPM, assign z grid as if it were not extended + + double *prd,*sublo,*subhi; + + if 
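/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): set_grid_local()
   above converts each processor's fractional sub-domain bounds into an
   inclusive range of owned grid planes; adjacent ranges tile 0..N-1
   without gaps or overlap because both ends use the same truncation.
   Names below are hypothetical
------------------------------------------------------------------------- */

#include <utility>

// owned grid planes for a sub-domain spanning the fractional interval
// [lo_frac,hi_frac) of an n-point grid, matching the
// static_cast<int>(split * n) ... - 1 pattern above
std::pair<int,int> owned_range(double lo_frac, double hi_frac, int n)
{
  int lo = static_cast<int>(lo_frac * n);
  int hi = static_cast<int>(hi_frac * n) - 1;   // inclusive upper bound
  return std::make_pair(lo, hi);
}
// e.g. n = 10 with splits 0.0 / 0.35 / 1.0 gives ranges 0..2 and 3..9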
(triclinic == 0) { + prd = domain->prd; + boxlo = domain->boxlo; + sublo = domain->sublo; + subhi = domain->subhi; + } else { + prd = domain->prd_lamda; + boxlo = domain->boxlo_lamda; + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double dist[3] = {0.0,0.0,0.0}; + double cuthalf = 0.5*neighbor->skin + qdist; + if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf; + else kspacebbox(cuthalf,&dist[0]); + + int nlo,nhi; + nlo = nhi = 0; + + nlo = static_cast ((sublo[0]-dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nhi = static_cast ((subhi[0]+dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nxlo_out = nlo + nlower; + nxhi_out = nhi + nupper; + + nlo = static_cast ((sublo[1]-dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nhi = static_cast ((subhi[1]+dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nylo_out = nlo + nlower; + nyhi_out = nhi + nupper; + + nlo = static_cast ((sublo[2]-dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nhi = static_cast ((subhi[2]+dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nzlo_out = nlo + nlower; + nzhi_out = nhi + nupper; + + if (stagger_flag) { + nxhi_out++; + nyhi_out++; + nzhi_out++; + } + + // for slab PPPM, change the grid boundary for processors at +z end + // to include the empty volume between periodically repeating slabs + // for slab PPPM, want charge data communicated from -z proc to +z proc, + // but not vice versa, also want field data communicated from +z proc to + // -z proc, but not vice versa + // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) + // also insure no other procs use ghost cells beyond +z limit + // differnet logic for non-tiled vs tiled decomposition + + if (slabflag == 1) { + if (comm->layout != Comm::LAYOUT_TILED) { + if (comm->myloc[2] == comm->procgrid[2]-1) nzhi_in = nzhi_out = nz_pppm - 1; + } else { + if (comm->mysplit[2][1] == 1.0) nzhi_in = nzhi_out = nz_pppm - 1; + } + nzhi_out = MIN(nzhi_out,nz_pppm-1); + } + + // x-pencil decomposition of FFT mesh + // global indices range from 0 to N-1 + // each proc owns entire x-dimension, clumps of columns in y,z dimensions + // npey_fft,npez_fft = # of procs in y,z dims + // if nprocs is small enough, proc can own 1 or more entire xy planes, + // else proc owns 2d sub-blocks of yz plane + // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions + // nlo_fft,nhi_fft = lower/upper limit of the section + // of the global FFT mesh that I own in x-pencil decomposition + + int npey_fft,npez_fft; + if (nz_pppm >= nprocs) { + npey_fft = 1; + npez_fft = nprocs; + } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); + + int me_y = me % npey_fft; + int me_z = me / npey_fft; + + nxlo_fft = 0; + nxhi_fft = nx_pppm - 1; + nylo_fft = me_y*ny_pppm/npey_fft; + nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; + nzlo_fft = me_z*nz_pppm/npez_fft; + nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; + + // ngrid = count of PPPM grid pts owned by this proc, including ghosts + + ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + + // count of FFT grids pts owned by this proc, without ghosts + // nfft = FFT points in x-pencil FFT decomposition on this proc + // nfft_brick = FFT points in 3d brick-decomposition on this proc + // nfft_both = greater of 2 values + + nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) * + (nzhi_fft-nzlo_fft+1); + int 
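/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the
   static_cast<int>(x + shift) - OFFSET pattern used just above (and in
   particle_map() below) exists because integer conversion truncates
   toward zero, so int(-0.75) would give 0 where the stencil needs -1;
   adding a large positive offset first makes truncation act like floor.
   The value 16384 below is only illustrative -- OFFSET is defined
   elsewhere in pppm2.cpp
------------------------------------------------------------------------- */

#include <cstdio>

static const int kOffset = 16384;      // any integer larger than |x| works

static int trunc_to_int(double x)     { return static_cast<int>(x); }
static int floor_via_offset(double x) { return static_cast<int>(x + kOffset) - kOffset; }

int main()
{
  std::printf("%d %d\n", trunc_to_int(-0.75), floor_via_offset(-0.75));  // 0 -1
  return 0;
}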
nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * + (nzhi_in-nzlo_in+1); + nfft_both = MAX(nfft,nfft_brick); +} + +/* ---------------------------------------------------------------------- + pre-compute Green's function denominator expansion coeffs, Gamma(2n) +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_denom() +{ + int k,l,m; + + for (l = 1; l < order; l++) gf_b[l] = 0.0; + gf_b[0] = 1.0; + + for (m = 1; m < order; m++) { + for (l = m; l > 0; l--) + gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1)); + gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5)); + } + + bigint ifact = 1; + for (k = 1; k < 2*order; k++) ifact *= k; + double gaminv = 1.0/ifact; + for (l = 0; l < order; l++) gf_b[l] *= gaminv; +} + +/* ---------------------------------------------------------------------- + pre-compute modified (Hockney-Eastwood) Coulomb Green's function +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ik() +{ + const double * const prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); + + double snx,sny,snz; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + double sqk; + + int k,l,m,n,nx,ny,nz,kper,lper,mper; + + const int nbx = static_cast ((g_ewald*xprd/(MY_PI*nx_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int nby = static_cast ((g_ewald*yprd/(MY_PI*ny_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int nbz = static_cast ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int twoorder = 2*order; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); + + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; + + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + pre-compute modified (Hockney-Eastwood) Coulomb Green's function + for a triclinic system +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ik_triclinic() +{ + double snx,sny,snz; + double 
argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + double sqk; + + int k,l,m,n,nx,ny,nz,kper,lper,mper; + + double tmp[3]; + tmp[0] = (g_ewald/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25); + tmp[1] = (g_ewald/(MY_PI*ny_pppm)) * pow(-log(EPS_HOC),0.25); + tmp[2] = (g_ewald/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25); + lamda2xT(&tmp[0],&tmp[0]); + const int nbx = static_cast (tmp[0]); + const int nby = static_cast (tmp[1]); + const int nbz = static_cast (tmp[2]); + + const int twoorder = 2*order; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(MY_PI*mper/nz_pppm)); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(MY_PI*lper/ny_pppm)); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(MY_PI*kper/nx_pppm)); + + double unitk_lamda[3]; + unitk_lamda[0] = 2.0*MY_PI*kper; + unitk_lamda[1] = 2.0*MY_PI*lper; + unitk_lamda[2] = 2.0*MY_PI*mper; + x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); + + sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]); + + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; + + for (nx = -nbx; nx <= nbx; nx++) { + argx = MY_PI*kper/nx_pppm + MY_PI*nx; + wx = powsinxx(argx,twoorder); + + for (ny = -nby; ny <= nby; ny++) { + argy = MY_PI*lper/ny_pppm + MY_PI*ny; + wy = powsinxx(argy,twoorder); + + for (nz = -nbz; nz <= nbz; nz++) { + argz = MY_PI*mper/nz_pppm + MY_PI*nz; + wz = powsinxx(argz,twoorder); + + double b[3]; + b[0] = 2.0*MY_PI*nx_pppm*nx; + b[1] = 2.0*MY_PI*ny_pppm*ny; + b[2] = 2.0*MY_PI*nz_pppm*nz; + x2lamdaT(&b[0],&b[0]); + + qx = unitk_lamda[0]+b[0]; + sx = exp(-0.25*square(qx/g_ewald)); + + qy = unitk_lamda[1]+b[1]; + sy = exp(-0.25*square(qy/g_ewald)); + + qz = unitk_lamda[2]+b[2]; + sz = exp(-0.25*square(qz/g_ewald)); + + dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + compute optimized Green's function for energy calculation +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ad() +{ + const double * const prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); + + double snx,sny,snz,sqk; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double numerator,denominator; + int k,l,m,n,kper,lper,mper; + + const int twoorder = 2*order; + + for (int i = 0; i < 6; i++) sf_coeff[i] = 0.0; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + qz = unitkz*mper; + snz = square(sin(0.5*qz*zprd_slab/nz_pppm)); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + qy = unitky*lper; + sny = square(sin(0.5*qy*yprd/ny_pppm)); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - 
nx_pppm*(2*k/nx_pppm); + qx = unitkx*kper; + snx = square(sin(0.5*qx*xprd/nx_pppm)); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + + sqk = qx*qx + qy*qy + qz*qz; + + if (sqk != 0.0) { + numerator = MY_4PI/sqk; + denominator = gf_denom(snx,sny,snz); + greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator; + sf_coeff[0] += sf_precoeff1[n]*greensfn[n]; + sf_coeff[1] += sf_precoeff2[n]*greensfn[n]; + sf_coeff[2] += sf_precoeff3[n]*greensfn[n]; + sf_coeff[3] += sf_precoeff4[n]*greensfn[n]; + sf_coeff[4] += sf_precoeff5[n]*greensfn[n]; + sf_coeff[5] += sf_precoeff6[n]*greensfn[n]; + n++; + } else { + greensfn[n] = 0.0; + sf_coeff[0] += sf_precoeff1[n]*greensfn[n]; + sf_coeff[1] += sf_precoeff2[n]*greensfn[n]; + sf_coeff[2] += sf_precoeff3[n]*greensfn[n]; + sf_coeff[3] += sf_precoeff4[n]*greensfn[n]; + sf_coeff[4] += sf_precoeff5[n]*greensfn[n]; + sf_coeff[5] += sf_precoeff6[n]*greensfn[n]; + n++; + } + } + } + } + + // compute the coefficients for the self-force correction + + double prex, prey, prez; + prex = prey = prez = MY_PI/volume; + prex *= nx_pppm/xprd; + prey *= ny_pppm/yprd; + prez *= nz_pppm/zprd_slab; + sf_coeff[0] *= prex; + sf_coeff[1] *= prex*2; + sf_coeff[2] *= prey; + sf_coeff[3] *= prey*2; + sf_coeff[4] *= prez; + sf_coeff[5] *= prez*2; + + // communicate values with other procs + + double tmp[6]; + MPI_Allreduce(sf_coeff,tmp,6,MPI_DOUBLE,MPI_SUM,world); + for (n = 0; n < 6; n++) sf_coeff[n] = tmp[n]; +} + +/* ---------------------------------------------------------------------- + compute self force coefficients for ad-differentiation scheme +------------------------------------------------------------------------- */ + +void PPPM2::compute_sf_precoeff() +{ + int i,k,l,m,n; + int nx,ny,nz,kper,lper,mper; + double wx0[5],wy0[5],wz0[5],wx1[5],wy1[5],wz1[5],wx2[5],wy2[5],wz2[5]; + double qx0,qy0,qz0,qx1,qy1,qz1,qx2,qy2,qz2; + double u0,u1,u2,u3,u4,u5,u6; + double sum1,sum2,sum3,sum4,sum5,sum6; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + + sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = 0.0; + for (i = 0; i < 5; i++) { + + qx0 = MY_2PI*(kper+nx_pppm*(i-2)); + qx1 = MY_2PI*(kper+nx_pppm*(i-1)); + qx2 = MY_2PI*(kper+nx_pppm*(i )); + wx0[i] = powsinxx(0.5*qx0/nx_pppm,order); + wx1[i] = powsinxx(0.5*qx1/nx_pppm,order); + wx2[i] = powsinxx(0.5*qx2/nx_pppm,order); + + qy0 = MY_2PI*(lper+ny_pppm*(i-2)); + qy1 = MY_2PI*(lper+ny_pppm*(i-1)); + qy2 = MY_2PI*(lper+ny_pppm*(i )); + wy0[i] = powsinxx(0.5*qy0/ny_pppm,order); + wy1[i] = powsinxx(0.5*qy1/ny_pppm,order); + wy2[i] = powsinxx(0.5*qy2/ny_pppm,order); + + qz0 = MY_2PI*(mper+nz_pppm*(i-2)); + qz1 = MY_2PI*(mper+nz_pppm*(i-1)); + qz2 = MY_2PI*(mper+nz_pppm*(i )); + + wz0[i] = powsinxx(0.5*qz0/nz_pppm,order); + wz1[i] = powsinxx(0.5*qz1/nz_pppm,order); + wz2[i] = powsinxx(0.5*qz2/nz_pppm,order); + } + + for (nx = 0; nx < 5; nx++) { + for (ny = 0; ny < 5; ny++) { + for (nz = 0; nz < 5; nz++) { + u0 = wx0[nx]*wy0[ny]*wz0[nz]; + u1 = wx1[nx]*wy0[ny]*wz0[nz]; + u2 = wx2[nx]*wy0[ny]*wz0[nz]; + u3 = wx0[nx]*wy1[ny]*wz0[nz]; + u4 = wx0[nx]*wy2[ny]*wz0[nz]; + u5 = wx0[nx]*wy0[ny]*wz1[nz]; + u6 = wx0[nx]*wy0[ny]*wz2[nz]; + + sum1 += u0*u1; + sum2 += u0*u2; + sum3 += u0*u3; + sum4 += u0*u4; + sum5 += u0*u5; + sum6 += u0*u6; + } + } + } + + // store values + + sf_precoeff1[n] = sum1; + 
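/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the wx/wy/wz
   factors in the Green's-function and self-force routines above come
   from powsinxx(), a LAMMPS math_special helper; assuming it evaluates
   (sin(x)/x)^n with the x -> 0 limit equal to 1, a plain version looks
   like the function below.  With x = 0.5*q*h this is the spectrum
   factor of the order-n charge-assignment stencil
------------------------------------------------------------------------- */

#include <cmath>

// unoptimized equivalent of powsinxx(x,n), under the assumption stated above
double pow_sinx_over_x(double x, int n)
{
  if (x == 0.0) return 1.0;
  return std::pow(std::sin(x)/x, n);
}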
sf_precoeff2[n] = sum2; + sf_precoeff3[n] = sum3; + sf_precoeff4[n] = sum4; + sf_precoeff5[n] = sum5; + sf_precoeff6[n++] = sum6; + } + } + } +} + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +void PPPM2::particle_map() +{ + int nx,ny,nz; + + double **x = atom->x; + int nlocal = atom->nlocal; + + int flag = 0; + + if (!std::isfinite(boxlo[0]) || !std::isfinite(boxlo[1]) || !std::isfinite(boxlo[2])) + error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); + + for (int i = 0; i < nlocal; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + nx = static_cast ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET; + ny = static_cast ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET; + nz = static_cast ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET; + + part2grid[i][0] = nx; + part2grid[i][1] = ny; + part2grid[i][2] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag = 1; + } + + if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + +void PPPM2::make_rho() +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density array + + memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + z0 = delvolinv * q[i]; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + y0 = z0*rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + x0 = y0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + density_brick[mz][my][mx] += x0*rho1d[0][l]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + remap density from 3d brick decomposition to FFT decomposition +------------------------------------------------------------------------- */ + +void PPPM2::brick2fft() +{ + int n,ix,iy,iz; + + // copy grabs inner portion of density from 3d brick + // remap could be done as pre-stage of FFT, + // but this works optimally on only double values, not complex values + + n = 0; + for 
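/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): make_rho() above
   spreads each charge onto an order^3 block of grid points using the
   product of three 1-d weights, so no 3-d stencil is ever stored.  The
   dense-grid version below uses hypothetical names, assumes order <= 5
   only to keep the arrays fixed-size, and omits the ghost handling of
   the real code
------------------------------------------------------------------------- */

#include <vector>

// separable spreading of one charge: weight(l,m,n) = w[0][l]*w[1][m]*w[2][n]
void spread_charge(std::vector<double> &grid, int nx, int ny,
                   int ix, int iy, int iz,           // "lower left" grid point
                   const double w1d[3][5], int order,
                   double q_over_dV)
{
  for (int n = 0; n < order; n++)
    for (int m = 0; m < order; m++)
      for (int l = 0; l < order; l++)
        grid[((iz+n)*ny + (iy+m))*nx + (ix+l)] +=
          q_over_dV * w1d[0][l]*w1d[1][m]*w1d[2][n];
}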
(iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_fft[n++] = density_brick[iz][iy][ix]; + + remap->perform(density_fft,density_fft,work1); +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver +------------------------------------------------------------------------- */ + +void PPPM2::poisson() +{ + if (differentiation_flag == 1) poisson_ad(); + else poisson_ik(); +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ik +------------------------------------------------------------------------- */ + +void PPPM2::poisson_ik() +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = ZEROF; + } + + fft1->compute(work1,work1,1); + + // global energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag_global || vflag_global) { + if (vflag_global) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + if (eflag_global) energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // extra FFTs for per-atom energy/virial + + if (evflag_atom) poisson_peratom(); + + // triclinic system + + if (triclinic) { + poisson_ik_triclinic(); + return; + } + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdx_brick[k][j][i] = work2[n]; + n += 2; + } + + // y direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fky[j]*work1[n+1]; + work2[n+1] = -fky[j]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdy_brick[k][j][i] = work2[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkz[k]*work1[n+1]; + work2[n+1] = -fkz[k]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdz_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ik for a triclinic system 
+------------------------------------------------------------------------- */ + +void PPPM2::poisson_ik_triclinic() +{ + int i,j,k,n; + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdx_brick[k][j][i] = work2[n]; + n += 2; + } + + // y direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fky[i]*work1[n+1]; + work2[n+1] = -fky[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdy_brick[k][j][i] = work2[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fkz[i]*work1[n+1]; + work2[n+1] = -fkz[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdz_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ad +------------------------------------------------------------------------- */ + +void PPPM2::poisson_ad() +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = ZEROF; + } + + fft1->compute(work1,work1,1); + + // global energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag_global || vflag_global) { + if (vflag_global) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + if (eflag_global) energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // extra FFTs for per-atom energy/virial + + if (vflag_atom) poisson_peratom(); + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]; + work2[n+1] = work1[n+1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + u_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for per-atom energy/virial +------------------------------------------------------------------------- */ + +void PPPM2::poisson_peratom() +{ + int i,j,k,n; + + // energy + + if (eflag_atom && differentiation_flag != 1) { + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]; + work2[n+1] = work1[n+1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + 
for (i = nxlo_in; i <= nxhi_in; i++) { + u_brick[k][j][i] = work2[n]; + n += 2; + } + } + + // 6 components of virial in v0 thru v5 + + if (!vflag_atom) return; + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][0]; + work2[n+1] = work1[n+1]*vg[i][0]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v0_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][1]; + work2[n+1] = work1[n+1]*vg[i][1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v1_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][2]; + work2[n+1] = work1[n+1]*vg[i][2]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v2_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][3]; + work2[n+1] = work1[n+1]*vg[i][3]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v3_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][4]; + work2[n+1] = work1[n+1]*vg[i][4]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v4_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][5]; + work2[n+1] = work1[n+1]*vg[i][5]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v5_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce() +{ + if (differentiation_flag == 1) fieldforce_ad(); + else fieldforce_ik(); +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ik +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_ik() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + 
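/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the interpolation
   loop continuing below gathers grid values back to the particle with
   the same 1-d weights used for spreading in make_rho(), i.e.
   interpolation is the transpose of charge assignment; fieldforce_ik()
   accumulates ekx/eky/ekz with a minus sign and then applies
   f += qqrd2e*scale*q[i]*ek.  Names below are hypothetical and mirror
   the spreading sketch above
------------------------------------------------------------------------- */

#include <vector>

// gather: interpolate a grid field to the particle with separable weights
double gather_field(const std::vector<double> &grid, int nx, int ny,
                    int ix, int iy, int iz,
                    const double w1d[3][5], int order)
{
  double val = 0.0;
  for (int n = 0; n < order; n++)
    for (int m = 0; m < order; m++)
      for (int l = 0; l < order; l++)
        val += w1d[0][l]*w1d[1][m]*w1d[2][n] *
               grid[((iz+n)*ny + (iy+m))*nx + (ix+l)];
  return val;
}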
ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + ekx -= x0*vdx_brick[mz][my][mx]; + eky -= x0*vdy_brick[mz][my][mx]; + ekz -= x0*vdz_brick[mz][my][mx]; + } + } + } + + // convert E-field to force + + const double qfactor = qqrd2e * scale * q[i]; + f[i][0] += qfactor*ekx; + f[i][1] += qfactor*eky; + if (slabflag != 2) f[i][2] += qfactor*ekz; + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_ad() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz; + FFT_SCALAR ekx,eky,ekz; + double s1,s2,s3; + double sf = 0.0; + double *prd; + + prd = domain->prd; + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + + double hx_inv = nx_pppm/xprd; + double hy_inv = ny_pppm/yprd; + double hz_inv = nz_pppm/zprd; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + compute_drho1d(dx,dy,dz); + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + ekx += drho1d[0][l]*rho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; + eky += rho1d[0][l]*drho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; + ekz += rho1d[0][l]*rho1d[1][m]*drho1d[2][n]*u_brick[mz][my][mx]; + } + } + } + ekx *= hx_inv; + eky *= hy_inv; + ekz *= hz_inv; + + // convert E-field to force and subtract self forces + + const double qfactor = qqrd2e * scale; + + s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; + sf = sf_coeff[0]*sin(2*MY_PI*s1); + sf += sf_coeff[1]*sin(4*MY_PI*s1); + sf *= 2*q[i]*q[i]; + f[i][0] += qfactor*(ekx*q[i] - sf); + + sf = sf_coeff[2]*sin(2*MY_PI*s2); + sf += sf_coeff[3]*sin(4*MY_PI*s2); + sf *= 2*q[i]*q[i]; + f[i][1] += qfactor*(eky*q[i] - sf); + + + sf = sf_coeff[4]*sin(2*MY_PI*s3); + sf += sf_coeff[5]*sin(4*MY_PI*s3); + sf *= 2*q[i]*q[i]; + if (slabflag != 2) f[i][2] += qfactor*(ekz*q[i] - sf); + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_peratom() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + + int nlocal 
= atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + if (eflag_atom) u += x0*u_brick[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick[mz][my][mx]; + v1 += x0*v1_brick[mz][my][mx]; + v2 += x0*v2_brick[mz][my][mx]; + v3 += x0*v3_brick[mz][my][mx]; + v4 += x0*v4_brick[mz][my][mx]; + v5 += x0*v5_brick[mz][my][mx]; + } + } + } + } + + if (eflag_atom) eatom[i] += q[i]*u; + if (vflag_atom) { + vatom[i][0] += q[i]*v0; + vatom[i][1] += q[i]*v1; + vatom[i][2] += q[i]*v2; + vatom[i][3] += q[i]*v3; + vatom[i][4] += q[i]*v4; + vatom[i][5] += q[i]*v5; + } + } +} + +/* ---------------------------------------------------------------------- + pack own values to buf to send to another proc +------------------------------------------------------------------------- */ + +void PPPM2::pack_forward2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + int n = 0; + + if (flag == FORWARD_IK) { + FFT_SCALAR *xsrc = &vdx_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *ysrc = &vdy_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *zsrc = &vdz_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + buf[n++] = xsrc[list[i]]; + buf[n++] = ysrc[list[i]]; + buf[n++] = zsrc[list[i]]; + } + } else if (flag == FORWARD_AD) { + FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + buf[i] = src[list[i]]; + } else if (flag == FORWARD_IK_PERATOM) { + FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + if (eflag_atom) buf[n++] = esrc[list[i]]; + if (vflag_atom) { + buf[n++] = v0src[list[i]]; + buf[n++] = v1src[list[i]]; + buf[n++] = v2src[list[i]]; + buf[n++] = v3src[list[i]]; + buf[n++] = v4src[list[i]]; + buf[n++] = v5src[list[i]]; + } + } + } else if (flag == FORWARD_AD_PERATOM) { + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + buf[n++] = v0src[list[i]]; + buf[n++] = v1src[list[i]]; + buf[n++] = v2src[list[i]]; + buf[n++] = v3src[list[i]]; + buf[n++] = v4src[list[i]]; + buf[n++] = v5src[list[i]]; + } + } +} + +/* ---------------------------------------------------------------------- + unpack another proc's own values from buf and set own ghost values +------------------------------------------------------------------------- */ + +void 
PPPM2::unpack_forward2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + int n = 0; + + if (flag == FORWARD_IK) { + FFT_SCALAR *xdest = &vdx_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *ydest = &vdy_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *zdest = &vdz_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + xdest[list[i]] = buf[n++]; + ydest[list[i]] = buf[n++]; + zdest[list[i]] = buf[n++]; + } + } else if (flag == FORWARD_AD) { + FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + dest[list[i]] = buf[i]; + } else if (flag == FORWARD_IK_PERATOM) { + FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + if (eflag_atom) esrc[list[i]] = buf[n++]; + if (vflag_atom) { + v0src[list[i]] = buf[n++]; + v1src[list[i]] = buf[n++]; + v2src[list[i]] = buf[n++]; + v3src[list[i]] = buf[n++]; + v4src[list[i]] = buf[n++]; + v5src[list[i]] = buf[n++]; + } + } + } else if (flag == FORWARD_AD_PERATOM) { + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + v0src[list[i]] = buf[n++]; + v1src[list[i]] = buf[n++]; + v2src[list[i]] = buf[n++]; + v3src[list[i]] = buf[n++]; + v4src[list[i]] = buf[n++]; + v5src[list[i]] = buf[n++]; + } + } +} + +/* ---------------------------------------------------------------------- + pack ghost values into buf to send to another proc +------------------------------------------------------------------------- */ + +void PPPM2::pack_reverse2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + if (flag == REVERSE_RHO) { + FFT_SCALAR *src = &density_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + buf[i] = src[list[i]]; + } +} + +/* ---------------------------------------------------------------------- + unpack another proc's ghost values from buf and add to own values +------------------------------------------------------------------------- */ + +void PPPM2::unpack_reverse2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + if (flag == REVERSE_RHO) { + FFT_SCALAR *dest = &density_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + dest[list[i]] += buf[i]; + } +} + +/* ---------------------------------------------------------------------- + map nprocs to NX by NY grid as PX by PY procs - return optimal px,py +------------------------------------------------------------------------- */ + +void PPPM2::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py) +{ + // loop thru all possible factorizations of nprocs + // surf = surface area of largest proc sub-domain + // innermost if test minimizes surface area and surface/volume ratio + + int bestsurf = 2 * (nx + ny); + int 
bestboxx = 0; + int bestboxy = 0; + + int boxx,boxy,surf,ipx,ipy; + + ipx = 1; + while (ipx <= nprocs) { + if (nprocs % ipx == 0) { + ipy = nprocs/ipx; + boxx = nx/ipx; + if (nx % ipx) boxx++; + boxy = ny/ipy; + if (ny % ipy) boxy++; + surf = boxx + boxy; + if (surf < bestsurf || + (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) { + bestsurf = surf; + bestboxx = boxx; + bestboxy = boxy; + *px = ipx; + *py = ipy; + } + } + ipx++; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into rho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +void PPPM2::compute_rho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy, + const FFT_SCALAR &dz) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-1; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + rho1d[0][k] = r1; + rho1d[1][k] = r2; + rho1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into drho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +void PPPM2::compute_drho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy, + const FFT_SCALAR &dz) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-2; l >= 0; l--) { + r1 = drho_coeff[l][k] + r1*dx; + r2 = drho_coeff[l][k] + r2*dy; + r3 = drho_coeff[l][k] + r3*dz; + } + drho1d[0][k] = r1; + drho1d[1][k] = r2; + drho1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + generate coeffients for the weight function of order n + + (n-1) + Wn(x) = Sum wn(k,x) , Sum is over every other integer + k=-(n-1) + For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1 + k is odd integers if n is even and even integers if n is odd + --- + | n-1 + | Sum a(l,j)*(x-k/2)**l if abs(x-k/2) < 1/2 + wn(k,x) = < l=0 + | + | 0 otherwise + --- + a coeffients are packed into the array rho_coeff to eliminate zeros + rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k) +------------------------------------------------------------------------- */ + +void PPPM2::compute_rho_coeff() +{ + int j,k,l,m; + FFT_SCALAR s; + + FFT_SCALAR **a; + memory->create2d_offset(a,order,-order,order,"pppm:a"); + + for (k = -order; k <= order; k++) + for (l = 0; l < order; l++) + a[l][k] = 0.0; + + a[0][0] = 1.0; + for (j = 1; j < order; j++) { + for (k = -j; k <= j; k += 2) { + s = 0.0; + for (l = 0; l < j; l++) { + a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1); +#ifdef FFT_SINGLE + s += powf(0.5,(float) l+1) * + (a[l][k-1] + powf(-1.0,(float) l) * a[l][k+1]) / (l+1); +#else + s += pow(0.5,(double) l+1) * + (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1); +#endif + } + a[0][k] = s; + } + } + + m = (1-order)/2; + for (k = -(order-1); k < order; k += 2) { + for (l = 0; l < order; l++) + rho_coeff[l][m] = a[l][k]; + for (l = 1; l < order; l++) + drho_coeff[l-1][m] = l*a[l][k]; + m++; + } + + memory->destroy2d_offset(a,-order); +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. 
Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. Also + extended to non-neutral systems (J. Chem. Phys. 131, 094107). +------------------------------------------------------------------------- */ + +void PPPM2::slabcorr() +{ + // compute local contribution to global dipole moment + + double *q = atom->q; + double **x = atom->x; + double zprd = domain->zprd; + int nlocal = atom->nlocal; + + double dipole = 0.0; + for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2]; + + // sum local contributions to get global dipole moment + + double dipole_all; + MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); + + // need to make non-neutral systems and/or + // per-atom energy translationally invariant + + double dipole_r2 = 0.0; + if (eflag_atom || fabs(qsum) > SMALL) { + for (int i = 0; i < nlocal; i++) + dipole_r2 += q[i]*x[i][2]*x[i][2]; + + // sum local contributions + + double tmp; + MPI_Allreduce(&dipole_r2,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2 = tmp; + } + + // compute corrections + + const double e_slabcorr = MY_2PI*(dipole_all*dipole_all - + qsum*dipole_r2 - qsum*qsum*zprd*zprd/12.0)/volume; + const double qscale = qqrd2e * scale; + + if (eflag_global) energy += qscale * e_slabcorr; + + // per-atom energy + + if (eflag_atom) { + double efact = qscale * MY_2PI/volume; + for (int i = 0; i < nlocal; i++) + eatom[i] += efact * q[i]*(x[i][2]*dipole_all - 0.5*(dipole_r2 + + qsum*x[i][2]*x[i][2]) - qsum*zprd*zprd/12.0); + } + + // add on force corrections + + double ffact = qscale * (-4.0*MY_PI/volume); + double **f = atom->f; + + for (int i = 0; i < nlocal; i++) f[i][2] += ffact * q[i]*(dipole_all - qsum*x[i][2]); +} + +/* ---------------------------------------------------------------------- + perform and time the 1d FFTs required for N timesteps +------------------------------------------------------------------------- */ + +int PPPM2::timing_1d(int n, double &time1d) +{ + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->timing1d(work1,nfft_both,1); + fft2->timing1d(work1,nfft_both,-1); + if (differentiation_flag != 1) { + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + } + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time1d = time2 - time1; + + if (differentiation_flag) return 2; + return 4; +} + +/* ---------------------------------------------------------------------- + perform and time the 3d FFTs required for N timesteps +------------------------------------------------------------------------- */ + +int PPPM2::timing_3d(int n, double &time3d) +{ + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->compute(work1,work1,1); + fft2->compute(work1,work1,-1); + if (differentiation_flag != 1) { + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + } + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time3d = time2 - time1; + + if (differentiation_flag) return 2; + return 4; +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double PPPM2::memory_usage() +{ + double bytes = nmax*3 * sizeof(double); + + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + if (differentiation_flag == 1) { + bytes += 2 
* nbrick * sizeof(FFT_SCALAR); + } else { + bytes += 4 * nbrick * sizeof(FFT_SCALAR); + } + + if (triclinic) bytes += 3 * nfft_both * sizeof(double); + bytes += 6 * nfft_both * sizeof(double); + bytes += nfft_both * sizeof(double); + bytes += nfft_both*5 * sizeof(FFT_SCALAR); + + if (peratom_allocate_flag) + bytes += 6 * nbrick * sizeof(FFT_SCALAR); + + if (group_allocate_flag) { + bytes += 2 * nbrick * sizeof(FFT_SCALAR); + bytes += 2 * nfft_both * sizeof(FFT_SCALAR);; + } + + // two GridComm bufs + + bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); + + return bytes; +} + +/* ---------------------------------------------------------------------- + group-group interactions + ------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + compute the PPPM total long-range force and energy for groups A and B + ------------------------------------------------------------------------- */ + +void PPPM2::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) +{ + if (slabflag && triclinic) + error->all(FLERR,"Cannot (yet) use K-space slab " + "correction with compute group/group for triclinic systems"); + + if (differentiation_flag) + error->all(FLERR,"Cannot (yet) use kspace_modify " + "diff ad with compute group/group"); + + if (!group_allocate_flag) allocate_groups(); + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + e2group = 0.0; //energy + f2group[0] = 0.0; //force in x-direction + f2group[1] = 0.0; //force in y-direction + f2group[2] = 0.0; //force in z-direction + + // map my particle charge onto my local 3d density grid + + make_rho_groups(groupbit_A,groupbit_B,AA_flag); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + // temporarily store and switch pointers so we can + // use brick2fft() for groups A and B (without + // writing an additional function) + + FFT_SCALAR ***density_brick_real = density_brick; + FFT_SCALAR *density_fft_real = density_fft; + + // group A + + density_brick = density_A_brick; + density_fft = density_A_fft; + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // group B + + density_brick = density_B_brick; + density_fft = density_B_fft; + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // switch back pointers + + density_brick = density_brick_real; + density_fft = density_fft_real; + + // compute potential gradient on my FFT grid and + // portion of group-group energy/force on this proc's FFT grid + + poisson_groups(AA_flag); + + const double qscale = qqrd2e * scale; + + // total group A <--> group B energy + // self and boundary correction terms are in compute_group_group.cpp + + double e2group_all; + MPI_Allreduce(&e2group,&e2group_all,1,MPI_DOUBLE,MPI_SUM,world); + e2group = e2group_all; + + e2group *= qscale*0.5*volume; + + // total group A <--> group B force + + double f2group_all[3]; + MPI_Allreduce(f2group,f2group_all,3,MPI_DOUBLE,MPI_SUM,world); + + f2group[0] = qscale*volume*f2group_all[0]; + f2group[1] = qscale*volume*f2group_all[1]; + if (slabflag != 2) f2group[2] = qscale*volume*f2group_all[2]; + + // convert atoms back from lamda to box coords + + if (triclinic) 
domain->lamda2x(atom->nlocal); + + if (slabflag == 1) + slabcorr_groups(groupbit_A, groupbit_B, AA_flag); +} + +/* ---------------------------------------------------------------------- + allocate group-group memory that depends on # of K-vectors and order + ------------------------------------------------------------------------- */ + +void PPPM2::allocate_groups() +{ + group_allocate_flag = 1; + + memory->create3d_offset(density_A_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_A_brick"); + memory->create3d_offset(density_B_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_B_brick"); + memory->create(density_A_fft,nfft_both,"pppm:density_A_fft"); + memory->create(density_B_fft,nfft_both,"pppm:density_B_fft"); +} + +/* ---------------------------------------------------------------------- + deallocate group-group memory that depends on # of K-vectors and order + ------------------------------------------------------------------------- */ + +void PPPM2::deallocate_groups() +{ + group_allocate_flag = 0; + + memory->destroy3d_offset(density_A_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(density_B_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy(density_A_fft); + memory->destroy(density_B_fft); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid for group-group interactions + ------------------------------------------------------------------------- */ + +void PPPM2::make_rho_groups(int groupbit_A, int groupbit_B, int AA_flag) +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density arrays + + memset(&(density_A_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + memset(&(density_B_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + int nlocal = atom->nlocal; + int *mask = atom->mask; + + for (int i = 0; i < nlocal; i++) { + + if (!((mask[i] & groupbit_A) && (mask[i] & groupbit_B))) + if (AA_flag) continue; + + if ((mask[i] & groupbit_A) || (mask[i] & groupbit_B)) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + z0 = delvolinv * q[i]; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + y0 = z0*rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + x0 = y0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + + // group A + + if (mask[i] & groupbit_A) + density_A_brick[mz][my][mx] += x0*rho1d[0][l]; + + // group B + + if (mask[i] & groupbit_B) + density_B_brick[mz][my][mx] += x0*rho1d[0][l]; + } + } + } + } + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for group-group interactions + ------------------------------------------------------------------------- */ + +void PPPM2::poisson_groups(int AA_flag) +{ + 
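  // reciprocal-space sums evaluated below, once work_A/work_B hold the
  // forward FFTs of the group A and group B charge densities:
  //   energy:  e2group += s2 * G(k) * Re[ rho_A(k) * conj(rho_B(k)) ]
  //   force:   f2group += (kx,ky,kz) * Im[ rho_A(k) * conj(rho_B(k)) ],
  //            accumulated after work_A has been scaled by s2 * G(k)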
int i,j,k,n; + + // reuse memory (already declared) + + FFT_SCALAR *work_A = work1; + FFT_SCALAR *work_B = work2; + + // transform charge density (r -> k) + + // group A + + n = 0; + for (i = 0; i < nfft; i++) { + work_A[n++] = density_A_fft[i]; + work_A[n++] = ZEROF; + } + + fft1->compute(work_A,work_A,1); + + // group B + + n = 0; + for (i = 0; i < nfft; i++) { + work_B[n++] = density_B_fft[i]; + work_B[n++] = ZEROF; + } + + fft1->compute(work_B,work_B,1); + + // group-group energy and force contribution, + // keep everything in reciprocal space so + // no inverse FFTs needed + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + // energy + + n = 0; + for (i = 0; i < nfft; i++) { + e2group += s2 * greensfn[i] * + (work_A[n]*work_B[n] + work_A[n+1]*work_B[n+1]); + n += 2; + } + + if (AA_flag) return; + + + // multiply by Green's function and s2 + // (only for work_A so it is not squared below) + + n = 0; + for (i = 0; i < nfft; i++) { + work_A[n++] *= s2 * greensfn[i]; + work_A[n++] *= s2 * greensfn[i]; + } + + // triclinic system + + if (triclinic) { + poisson_groups_triclinic(); + return; + } + + double partial_group; + + // force, x direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[0] += fkx[i] * partial_group; + n += 2; + } + + // force, y direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[1] += fky[j] * partial_group; + n += 2; + } + + // force, z direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[2] += fkz[k] * partial_group; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for group-group interactions + for a triclinic system + ------------------------------------------------------------------------- */ + +void PPPM2::poisson_groups_triclinic() +{ + int i,n; + + // reuse memory (already declared) + + FFT_SCALAR *work_A = work1; + FFT_SCALAR *work_B = work2; + + double partial_group; + + // force, x direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[0] += fkx[i] * partial_group; + n += 2; + } + + // force, y direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[1] += fky[i] * partial_group; + n += 2; + } + + // force, z direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[2] += fkz[i] * partial_group; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. Also + extended to non-neutral systems (J. Chem. Phys. 131, 094107). 
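   The A <-> B correction accumulated below is
     E_corr = 2*pi*qscale/V * [ M_A*M_B - (q_A*S_B + q_B*S_A)/2
                                - q_A*q_B*Lz^2/12 ]
   with M = sum q_i*z_i, S = sum q_i*z_i^2, q = net charge of the group;
   the matching z-force term is -4*pi*qscale/V * (q_A*M_B - q_B*M_A).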
+------------------------------------------------------------------------- */ + +void PPPM2::slabcorr_groups(int groupbit_A, int groupbit_B, int AA_flag) +{ + // compute local contribution to global dipole moment + + double *q = atom->q; + double **x = atom->x; + double zprd = domain->zprd; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + double qsum_A = 0.0; + double qsum_B = 0.0; + double dipole_A = 0.0; + double dipole_B = 0.0; + double dipole_r2_A = 0.0; + double dipole_r2_B = 0.0; + + for (int i = 0; i < nlocal; i++) { + if (!((mask[i] & groupbit_A) && (mask[i] & groupbit_B))) + if (AA_flag) continue; + + if (mask[i] & groupbit_A) { + qsum_A += q[i]; + dipole_A += q[i]*x[i][2]; + dipole_r2_A += q[i]*x[i][2]*x[i][2]; + } + + if (mask[i] & groupbit_B) { + qsum_B += q[i]; + dipole_B += q[i]*x[i][2]; + dipole_r2_B += q[i]*x[i][2]*x[i][2]; + } + } + + // sum local contributions to get total charge and global dipole moment + // for each group + + double tmp; + MPI_Allreduce(&qsum_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsum_A = tmp; + + MPI_Allreduce(&qsum_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsum_B = tmp; + + MPI_Allreduce(&dipole_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_A = tmp; + + MPI_Allreduce(&dipole_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_B = tmp; + + MPI_Allreduce(&dipole_r2_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2_A = tmp; + + MPI_Allreduce(&dipole_r2_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2_B = tmp; + + // compute corrections + + const double qscale = qqrd2e * scale; + const double efact = qscale * MY_2PI/volume; + + e2group += efact * (dipole_A*dipole_B - 0.5*(qsum_A*dipole_r2_B + + qsum_B*dipole_r2_A) - qsum_A*qsum_B*zprd*zprd/12.0); + + // add on force corrections + + const double ffact = qscale * (-4.0*MY_PI/volume); + f2group[2] += ffact * (qsum_A*dipole_B - qsum_B*dipole_A); +} diff --git a/src/KSPACE/pppm2.h b/src/KSPACE/pppm2.h new file mode 100644 index 0000000000..31b8534735 --- /dev/null +++ b/src/KSPACE/pppm2.h @@ -0,0 +1,360 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm2,PPPM2) + +#else + +#ifndef LMP_PPPM2_H +#define LMP_PPPM2_H + +#include "kspace.h" + +#if defined(FFT_FFTW3) +#define LMP_FFT_LIB "FFTW3" +#elif defined(FFT_MKL) +#define LMP_FFT_LIB "MKL FFT" +#elif defined(FFT_CUFFT) +#define LMP_FFT_LIB "cuFFT" +#else +#define LMP_FFT_LIB "KISS FFT" +#endif + +#ifdef FFT_SINGLE +typedef float FFT_SCALAR; +#define LMP_FFT_PREC "single" +#define MPI_FFT_SCALAR MPI_FLOAT +#else + +typedef double FFT_SCALAR; +#define LMP_FFT_PREC "double" +#define MPI_FFT_SCALAR MPI_DOUBLE +#endif + +namespace LAMMPS_NS { + +class PPPM2 : public KSpace { + public: + PPPM2(class LAMMPS *); + virtual ~PPPM2(); + virtual void settings(int, char **); + virtual void init(); + virtual void setup(); + virtual void setup_grid(); + virtual void compute(int, int); + virtual int timing_1d(int, double &); + virtual int timing_3d(int, double &); + virtual double memory_usage(); + + virtual void compute_group_group(int, int, int); + + protected: + int me,nprocs; + int nfactors; + int *factors; + double cutoff; + double volume; + double delxinv,delyinv,delzinv,delvolinv; + double h_x,h_y,h_z; + double shift,shiftone; + int peratom_allocate_flag; + + int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in; + int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out; + int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost; + int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft; + int nlower,nupper; + int ngrid,nfft,nfft_both; + + FFT_SCALAR ***density_brick; + FFT_SCALAR ***vdx_brick,***vdy_brick,***vdz_brick; + FFT_SCALAR ***u_brick; + FFT_SCALAR ***v0_brick,***v1_brick,***v2_brick; + FFT_SCALAR ***v3_brick,***v4_brick,***v5_brick; + double *greensfn; + double **vg; + double *fkx,*fky,*fkz; + FFT_SCALAR *density_fft; + FFT_SCALAR *work1,*work2; + + double *gf_b; + FFT_SCALAR **rho1d,**rho_coeff,**drho1d,**drho_coeff; + double *sf_precoeff1, *sf_precoeff2, *sf_precoeff3; + double *sf_precoeff4, *sf_precoeff5, *sf_precoeff6; + double sf_coeff[6]; // coefficients for calculating ad self-forces + double **acons; + + // FFTs and grid communication + + class FFT3d *fft1,*fft2; + class Remap *remap; + class GridComm2 *gc; + FFT_SCALAR *gc_buf1,*gc_buf2; + int ngc_buf1,ngc_buf2,npergrid; + + // group-group interactions + + int group_allocate_flag; + FFT_SCALAR ***density_A_brick,***density_B_brick; + FFT_SCALAR *density_A_fft,*density_B_fft; + + int **part2grid; // storage for particle -> grid mapping + int nmax; + + double *boxlo; + // TIP4P settings + int typeH,typeO; // atom types of TIP4P water H and O atoms + double qdist; // distance from O site to negative charge + double alpha; // geometric factor + + virtual void set_grid_global(); + void set_grid_local(); + void adjust_gewald(); + virtual double newton_raphson_f(); + double derivf(); + double final_accuracy(); + + virtual void allocate(); + virtual void allocate_peratom(); + virtual void deallocate(); + virtual void deallocate_peratom(); + int factorable(int); + double compute_df_kspace(); + double estimate_ik_error(double, double, bigint); + virtual double compute_qopt(); + virtual void compute_gf_denom(); + virtual void compute_gf_ik(); + virtual void compute_gf_ad(); + void compute_sf_precoeff(); + + virtual void particle_map(); + virtual void make_rho(); + virtual void brick2fft(); + + virtual void poisson(); + virtual void poisson_ik(); + virtual void poisson_ad(); + + virtual void fieldforce(); 
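  // _ik variant: E-field from the vdx/vdy/vdz bricks
  // _ad variant: differentiates the u_brick potential per atom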
+ virtual void fieldforce_ik(); + virtual void fieldforce_ad(); + + virtual void poisson_peratom(); + virtual void fieldforce_peratom(); + void procs2grid2d(int,int,int,int *, int*); + void compute_rho1d(const FFT_SCALAR &, const FFT_SCALAR &, + const FFT_SCALAR &); + void compute_drho1d(const FFT_SCALAR &, const FFT_SCALAR &, + const FFT_SCALAR &); + void compute_rho_coeff(); + virtual void slabcorr(); + + // grid communication + + virtual void pack_forward2(int, void *, int, int *); + virtual void unpack_forward2(int, void *, int, int *); + virtual void pack_reverse2(int, void *, int, int *); + virtual void unpack_reverse2(int, void *, int, int *); + + // triclinic + + int triclinic; // domain settings, orthog or triclinic + void setup_triclinic(); + void compute_gf_ik_triclinic(); + void poisson_ik_triclinic(); + void poisson_groups_triclinic(); + + // group-group interactions + + virtual void allocate_groups(); + virtual void deallocate_groups(); + virtual void make_rho_groups(int, int, int); + virtual void poisson_groups(int); + virtual void slabcorr_groups(int,int,int); + +/* ---------------------------------------------------------------------- + denominator for Hockney-Eastwood Green's function + of x,y,z = sin(kx*deltax/2), etc + + inf n-1 + S(n,k) = Sum W(k+pi*j)**2 = Sum b(l)*(z*z)**l + j=-inf l=0 + + = -(z*z)**n /(2n-1)! * (d/dx)**(2n-1) cot(x) at z = sin(x) + gf_b = denominator expansion coeffs +------------------------------------------------------------------------- */ + + inline double gf_denom(const double &x, const double &y, + const double &z) const { + double sx,sy,sz; + sz = sy = sx = 0.0; + for (int l = order-1; l >= 0; l--) { + sx = gf_b[l] + sx*x; + sy = gf_b[l] + sy*y; + sz = gf_b[l] + sz*z; + } + double s = sx*sy*sz; + return s*s; + }; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Must redefine kspace_style after changing to triclinic box + +UNDOCUMENTED + +E: Cannot (yet) use PPPM with triclinic box and kspace_modify diff ad + +This feature is not yet supported. + +E: Cannot (yet) use PPPM with triclinic box and slab correction + +This feature is not yet supported. + +E: Cannot use PPPM with 2d simulation + +The kspace style pppm cannot be used in 2d simulations. You can use +2d PPPM in a 3d simulation; see the kspace_modify command. + +E: PPPM can only currently be used with comm_style brick + +This is a current restriction in LAMMPS. + +E: Kspace style requires atom attribute q + +The atom style defined does not have these attributes. + +E: Cannot use non-periodic boundaries with PPPM + +For kspace style pppm, all 3 dimensions must have periodic boundaries +unless you use the kspace_modify command to define a 2d slab with a +non-periodic z dimension. + +E: Incorrect boundaries with slab PPPM + +Must have periodic x,y dimensions and non-periodic z dimension to use +2d slab option with PPPM. + +E: PPPM order cannot be < 2 or > than %d + +This is a limitation of the PPPM implementation in LAMMPS. + +E: KSpace style is incompatible with Pair style + +Setting a kspace style requires that a pair style with matching +long-range Coulombic or dispersion components be used. + +E: Pair style is incompatible with TIP4P KSpace style + +The pair style does not have the requires TIP4P settings. 
+ +E: Bond and angle potentials must be defined for TIP4P + +Cannot use TIP4P pair potential unless bond and angle potentials +are defined. + +E: Bad TIP4P angle type for PPPM/TIP4P + +Specified angle type is not valid. + +E: Bad TIP4P bond type for PPPM/TIP4P + +Specified bond type is not valid. + +W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor + +This may lead to a larger grid than desired. See the kspace_modify overlap +command to prevent changing of the PPPM order. + +E: PPPM order < minimum allowed order + +The default minimum order is 2. This can be reset by the +kspace_modify minorder command. + +E: PPPM grid stencil extends beyond nearest neighbor processor + +This is not allowed if the kspace_modify overlap setting is no. + +E: KSpace accuracy must be > 0 + +The kspace accuracy designated in the input must be greater than zero. + +E: Must use kspace_modify gewald for uncharged system + +UNDOCUMENTED + +E: Could not compute grid size + +The code is unable to compute a grid size consistent with the desired +accuracy. This error should not occur for typical problems. Please +send an email to the developers. + +E: PPPM grid is too large + +The global PPPM grid is larger than OFFSET in one or more dimensions. +OFFSET is currently set to 4096. You likely need to decrease the +requested accuracy. + +E: Could not compute g_ewald + +The Newton-Raphson solver failed to converge to a good value for +g_ewald. This error should not occur for typical problems. Please +send an email to the developers. + +E: Non-numeric box dimensions - simulation unstable + +The box size has apparently blown up. + +E: Out of range atoms - cannot compute PPPM + +One or more atoms are attempting to map their charge to a PPPM grid +point that is not owned by a processor. This is likely for one of two +reasons, both of them bad. First, it may mean that an atom near the +boundary of a processor's sub-domain has moved more than 1/2 the +"neighbor skin distance"_neighbor.html without neighbor lists being +rebuilt and atoms being migrated to new processors. This also means +you may be missing pairwise interactions that need to be computed. +The solution is to change the re-neighboring criteria via the +"neigh_modify"_neigh_modify command. The safest settings are "delay 0 +every 1 check yes". Second, it may mean that an atom has moved far +outside a processor's sub-domain or even the entire simulation box. +This indicates bad physics, e.g. due to highly overlapping atoms, too +large a timestep, etc. + +E: Cannot (yet) use K-space slab correction with compute group/group for triclinic systems + +This option is not yet supported. + +E: Cannot (yet) use kspace_modify diff ad with compute group/group + +This option is not yet supported. + +U: Cannot (yet) use PPPM with triclinic box and TIP4P + +This feature is not yet supported. 
+ +*/ diff --git a/src/force.cpp b/src/force.cpp index d2eb137d06..595ffd3140 100644 --- a/src/force.cpp +++ b/src/force.cpp @@ -665,9 +665,9 @@ void Force::create_kspace(const std::string &style, int trysuffix) kspace = new_kspace(style,trysuffix,sflag); store_style(kspace_style,style,sflag); - if (comm->style == 1 && !kspace_match("ewald",0)) - error->all(FLERR, - "Cannot yet use KSpace solver with grid with comm style tiled"); + //if (comm->style == 1 && !kspace_match("ewald",0)) + // error->all(FLERR, + // "Cannot yet use KSpace solver with grid with comm style tiled"); } /* ---------------------------------------------------------------------- diff --git a/src/kspace.h b/src/kspace.h index e26b9490d6..4d686d59f0 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -126,6 +126,11 @@ class KSpace : protected Pointers { virtual void pack_reverse(int, FFT_SCALAR *, int, int *) {}; virtual void unpack_reverse(int, FFT_SCALAR *, int, int *) {}; + virtual void pack_forward2(int, void *, int, int *) {}; + virtual void unpack_forward2(int, void *, int, int *) {}; + virtual void pack_reverse2(int, void *, int, int *) {}; + virtual void unpack_reverse2(int, void *, int, int *) {}; + virtual int timing(int, double &, double &) {return 0;} virtual int timing_1d(int, double &) {return 0;} virtual int timing_3d(int, double &) {return 0;}
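For readers who want to exercise the 2d processor-factorization heuristic from
PPPM2::procs2grid2d() outside of LAMMPS, a minimal standalone sketch follows. It
is illustrative only: the function name pick_grid2d, the main() driver, and the
12-proc / 64x32 sample grid are invented for the example, but the selection rule
mirrors the loop shown earlier (enumerate all factorizations px*py = nprocs,
take the ceiling of the per-proc sub-domain size in each direction, minimize
boxx+boxy, and break ties toward the larger sub-domain area).

// standalone sketch of the procs2grid2d() factorization heuristic
// (illustrative only; names and sample sizes are not part of the patch)

#include <cstdio>

static void pick_grid2d(int nprocs, int nx, int ny, int *px, int *py)
{
  int bestsurf = 2 * (nx + ny);        // worse than any real factorization
  int bestboxx = 0, bestboxy = 0;

  for (int ipx = 1; ipx <= nprocs; ipx++) {
    if (nprocs % ipx) continue;
    int ipy = nprocs / ipx;
    int boxx = (nx + ipx - 1) / ipx;   // ceil(nx/ipx) = largest sub-domain in x
    int boxy = (ny + ipy - 1) / ipy;   // ceil(ny/ipy) = largest sub-domain in y
    int surf = boxx + boxy;            // proxy for ghost-communication surface
    if (surf < bestsurf ||
        (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) {
      bestsurf = surf;
      bestboxx = boxx;
      bestboxy = boxy;
      *px = ipx;
      *py = ipy;
    }
  }
}

int main()
{
  int px, py;
  pick_grid2d(12, 64, 32, &px, &py);   // e.g. 12 procs on a 64x32 FFT plane
  printf("12 procs on 64x32 -> %d x %d\n", px, py);
  return 0;
}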