/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "grid2d.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
#include "fix.h"
#include "irregular.h"
#include "kspace.h"
#include "math_extra.h"
#include "memory.h"
#include "pair.h"
#include <cstring>
using namespace LAMMPS_NS;
static constexpr int DELTA = 16;
static constexpr int OFFSET = 16384;
/* ----------------------------------------------------------------------
NOTES:
if lo indices for ghosts are < 0 or hi indices are >= N,
then grid is treated as periodic in that dimension,
comm is done across the periodic boundaries
tiled implementations only work for RCB, not general tilings
b/c RCB tree is used to find neighboring tiles
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
constructor to create a 2d distributed grid
Grid2d assigns owned/ghost cells to each proc via setup_grid()
it MUST be called after the constructor
gcomm = caller's communicator
gnx,gny = global grid size
------------------------------------------------------------------------- */
Grid2d::Grid2d(LAMMPS *lmp, MPI_Comm gcomm, int gnx, int gny) :
Pointers(lmp), swap(nullptr), requests(nullptr), srequest(nullptr), rrequest(nullptr),
sresponse(nullptr), rresponse(nullptr), send(nullptr), recv(nullptr), copy(nullptr),
send_remap(nullptr), recv_remap(nullptr), overlap_procs(nullptr), xsplit(nullptr),
ysplit(nullptr), zsplit(nullptr), grid2proc(nullptr), rcbinfo(nullptr), overlap_list(nullptr)
{
gridcomm = gcomm;
MPI_Comm_rank(gridcomm, &me);
MPI_Comm_size(gridcomm, &nprocs);
nx = gnx;
ny = gny;
// default settings, can be overridden by set() methods
// these affect assignment of owned and ghost cells
maxdist = 0.0;
stencil_grid_lo = stencil_grid_hi = 0;
stencil_atom_lo = stencil_atom_hi = 0;
shift_grid = 0.5;
shift_atom_lo = shift_atom_hi = 0.0;
yextra = 0;
yfactor = 1.0;
}
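/* ----------------------------------------------------------------------
   usage sketch for this constructor path (illustrative only, not part of
   this file; the caller context, grid size, and cutoff name are assumptions):
     Grid2d *grid = new Grid2d(lmp, world, 32, 32);
     grid->set_distance(cutoff);       // max atom distance outside subdomain
     grid->set_stencil_atom(2, 2);     // stencil extent around an atom's cell
     int ixlo, ixhi, iylo, iyhi, oxlo, oxhi, oylo, oyhi;
     grid->setup_grid(ixlo, ixhi, iylo, iyhi, oxlo, oxhi, oylo, oyhi);
   after setup_grid() the caller allocates its per-cell data over the
   owned+ghost extent (oxlo:oxhi, oylo:oyhi) and then calls setup_comm()
------------------------------------------------------------------------- */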
/* ----------------------------------------------------------------------
alternate constructor to create a 2d distributed grid
caller assigns owned/ghost cells to each proc
setup_grid() must NOT be called
gcomm = caller's communicator
gnx,gny = global grid size
i xy lo/hi = extent of owned grid cells on this proc
o xy lo/hi = extent of owned+ghost grid cells on this proc
owned and ghost indices are inclusive
owned indices range from 0 to N-1
ghost indices can extend < 0 or >= N
------------------------------------------------------------------------- */
Grid2d::Grid2d(LAMMPS *lmp, MPI_Comm gcomm, int gnx, int gny, int ixlo, int ixhi, int iylo,
int iyhi, int oxlo, int oxhi, int oylo, int oyhi) :
Pointers(lmp), swap(nullptr), requests(nullptr), srequest(nullptr), rrequest(nullptr),
sresponse(nullptr), rresponse(nullptr), send(nullptr), recv(nullptr), copy(nullptr),
send_remap(nullptr), recv_remap(nullptr), overlap_procs(nullptr), xsplit(nullptr),
ysplit(nullptr), zsplit(nullptr), grid2proc(nullptr), rcbinfo(nullptr), overlap_list(nullptr)
{
gridcomm = gcomm;
MPI_Comm_rank(gridcomm, &me);
MPI_Comm_size(gridcomm, &nprocs);
nx = gnx;
ny = gny;
// store owned/ghost indices provided by caller
inxlo = ixlo;
inxhi = ixhi;
inylo = iylo;
inyhi = iyhi;
outxlo = oxlo;
outxhi = oxhi;
outylo = oylo;
outyhi = oyhi;
// additional initialization
// other constructor invokes this from setup_grid()
Grid2d::initialize();
}
/* ---------------------------------------------------------------------- */
Grid2d::~Grid2d()
{
// brick comm data structs
for (int i = 0; i < nswap; i++) {
memory->destroy(swap[i].packlist);
memory->destroy(swap[i].unpacklist);
}
memory->sfree(swap);
delete[] xsplit;
delete[] ysplit;
memory->destroy(grid2proc);
// tiled comm data structs
for (int i = 0; i < nsend; i++) memory->destroy(send[i].packlist);
memory->sfree(send);
for (int i = 0; i < nrecv; i++) memory->destroy(recv[i].unpacklist);
memory->sfree(recv);
for (int i = 0; i < ncopy; i++) {
memory->destroy(copy[i].packlist);
memory->destroy(copy[i].unpacklist);
}
memory->sfree(copy);
delete[] requests;
delete[] requests_remap;
memory->sfree(rcbinfo);
// remap data structs
deallocate_remap();
}
// ----------------------------------------------------------------------
// set Grid parameters
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
maxdist = max distance outside proc subdomain a particle can be
used to determine extent of ghost cells
------------------------------------------------------------------------- */
void Grid2d::set_distance(double distance)
{
maxdist = distance;
}
/* ----------------------------------------------------------------------
# of grid cells beyond an owned grid cell that caller accesses
can be different in lo vs hi direction
------------------------------------------------------------------------- */
void Grid2d::set_stencil_grid(int lo, int hi)
{
stencil_grid_lo = lo;
stencil_grid_hi = hi;
}
/* ----------------------------------------------------------------------
# of grid cells beyond a particle's grid cell that caller accesses
can be different in lo vs hi direction
------------------------------------------------------------------------- */
void Grid2d::set_stencil_atom(int lo, int hi)
{
stencil_atom_lo = lo;
stencil_atom_hi = hi;
}
/* ----------------------------------------------------------------------
shift_grid = offset within grid cell of position of grid point
0.5 = cell center (default), 0.0 = lower-left corner of cell
used to decide which proc owns a grid cell (grid pt within subdomain)
------------------------------------------------------------------------- */
void Grid2d::set_shift_grid(double shift)
{
shift_grid = shift;
}
/* ----------------------------------------------------------------------
shift_atom = offset added to atoms when caller maps them to grid cells
0.5 = half a grid cell, 0.0 = no offset
used to compute maximum possible ghost extents
use of lo/hi allows max ghost extent on each side to be different
------------------------------------------------------------------------- */
void Grid2d::set_shift_atom(double shift_lo, double shift_hi)
{
shift_atom_lo = shift_lo;
shift_atom_hi = shift_hi;
}
/* ----------------------------------------------------------------------
enable extra grid cells in Y
factor = multiplication factor on box size Y and thus grid size
factor > 1.0 when grid extends beyond Y box size (3.0 = tripled in size)
only enable yextra if factor > 1.0
default yextra = 0, factor = 1.0 (no extra grid cells in Y)
------------------------------------------------------------------------- */
void Grid2d::set_yfactor(double factor)
{
if (factor == 1.0)
yextra = 0;
else
yextra = 1;
yfactor = factor;
}
/* ----------------------------------------------------------------------
set IDs of proc neighbors used in uniform local owned/ghost comm
must be called BEFORE setup_comm() to override
the processor neighbors stored by extract_comm_info()
------------------------------------------------------------------------- */
void Grid2d::set_proc_neighs(int pxlo, int pxhi, int pylo, int pyhi)
{
procxlo = pxlo;
procxhi = pxhi;
procylo = pylo;
procyhi = pyhi;
}
/* ----------------------------------------------------------------------
set allocation dimensions of caller grid used by indices() to setup pack/unpack
must be called BEFORE setup_comm() to override
the caller grid size used in indices()
------------------------------------------------------------------------- */
void Grid2d::set_caller_grid(int fxlo, int fxhi, int fylo, int fyhi)
{
fullxlo = fxlo;
fullxhi = fxhi;
fullylo = fylo;
fullyhi = fyhi;
}
// ----------------------------------------------------------------------
// retrieve Grid parameters
// ----------------------------------------------------------------------
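/* ----------------------------------------------------------------------
   compare owned and ghost bounds of this grid to those of grid2
   return 1 if they match on all procs in gridcomm, else 0
------------------------------------------------------------------------- */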
int Grid2d::identical(Grid2d *grid2)
{
int inxlo2, inxhi2, inylo2, inyhi2;
int outxlo2, outxhi2, outylo2, outyhi2;
grid2->get_bounds_owned(inxlo2, inxhi2, inylo2, inyhi2);
grid2->get_bounds_ghost(outxlo2, outxhi2, outylo2, outyhi2);
int flag = 0;
if ((inxlo != inxlo2) || (inxhi != inxhi2) || (inylo != inylo2) || (inyhi != inyhi2)) flag = 1;
if ((outxlo != outxlo2) || (outxhi != outxhi2) || (outylo != outylo2) || (outyhi != outyhi2))
flag = 1;
int flagall;
MPI_Allreduce(&flag, &flagall, 1, MPI_INT, MPI_SUM, gridcomm);
if (flagall) return 0;
return 1;
}
/* ---------------------------------------------------------------------- */
void Grid2d::get_size(int &nxgrid, int &nygrid)
{
nxgrid = nx;
nygrid = ny;
}
/* ---------------------------------------------------------------------- */
void Grid2d::get_bounds_owned(int &xlo, int &xhi, int &ylo, int &yhi)
{
xlo = inxlo;
xhi = inxhi;
ylo = inylo;
yhi = inyhi;
}
/* ---------------------------------------------------------------------- */
void Grid2d::get_bounds_ghost(int &xlo, int &xhi, int &ylo, int &yhi)
{
xlo = outxlo;
xhi = outxhi;
ylo = outylo;
yhi = outyhi;
}
// ----------------------------------------------------------------------
// define owned and ghost grid cells
// also store comm and grid partitioning info
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
setup grid partition for each proc = owned + ghost cells
return:
i xy lohi = portion of global grid this proc owns, 0 <= index < N
o xy lohi = owned + ghost grid cells in all directions
for periodic dims, o indices can be < 0 or >= N
for non-periodic dims, o indices will be >= 0 and < N
since no grid comm is done across non-periodic boundaries
------------------------------------------------------------------------- */
void Grid2d::setup_grid(int &ixlo, int &ixhi, int &iylo, int &iyhi, int &oxlo, int &oxhi, int &oylo,
int &oyhi)
{
// owned grid cells = those whose grid point is within proc subdomain
// shift_grid = 0.5 for grid point at cell center, 0.0 for lower-left corner
double fraclo, frachi;
if (comm->layout != Comm::LAYOUT_TILED) {
fraclo = comm->xsplit[comm->myloc[0]];
frachi = comm->xsplit[comm->myloc[0] + 1];
partition_grid(nx, fraclo, frachi, shift_grid, 0, inxlo, inxhi);
fraclo = comm->ysplit[comm->myloc[1]];
frachi = comm->ysplit[comm->myloc[1] + 1];
partition_grid(ny, fraclo, frachi, shift_grid, yextra, inylo, inyhi);
} else {
fraclo = comm->mysplit[0][0];
frachi = comm->mysplit[0][1];
partition_grid(nx, fraclo, frachi, shift_grid, 0, inxlo, inxhi);
fraclo = comm->mysplit[1][0];
frachi = comm->mysplit[1][1];
partition_grid(ny, fraclo, frachi, shift_grid, yextra, inylo, inyhi);
}
// extend owned grid bounds with ghost grid cells in each direction
ghost_grid();
// additional initialization
// other constructor invokes this directly
initialize();
// return values
ixlo = inxlo;
ixhi = inxhi;
iylo = inylo;
iyhi = inyhi;
oxlo = outxlo;
oxhi = outxhi;
oylo = outylo;
oyhi = outyhi;
}
/* ----------------------------------------------------------------------
additional one-time setup common to both constructors
---------------------------------------------------------------------- */
void Grid2d::initialize()
{
// error check on size of grid stored by this proc
bigint total = (bigint) (outxhi - outxlo + 1) * (outyhi - outylo + 1);
if (total > MAXSMALLINT) error->one(FLERR, "Too many owned+ghost grid2d points");
// default = caller grid is allocated to ghost grid
// used when computing pack/unpack lists in indices()
// these values can be overridden using set_caller_grid()
fullxlo = outxlo;
fullxhi = outxhi;
fullylo = outylo;
fullyhi = outyhi;
// initialize data structs
nswap = maxswap = 0;
swap = nullptr;
nsend = nrecv = ncopy = 0;
send = nullptr;
recv = nullptr;
copy = nullptr;
requests = nullptr;
requests_remap = nullptr;
xsplit = ysplit = zsplit = nullptr;
grid2proc = nullptr;
rcbinfo = nullptr;
nsend_remap = nrecv_remap = self_remap = 0;
send_remap = nullptr;
recv_remap = nullptr;
// store info about Comm decomposition needed for remap operation
// two Grid instances will exist for duration of remap
// each must know Comm decomp at time Grid instance was created
extract_comm_info();
}
/* ----------------------------------------------------------------------
partition a global regular grid into one brick-shaped sub-grid per proc
if grid point is inside my sub-domain I own it,
this includes sub-domain lo boundary but excludes hi boundary
ngrid = extent of global grid in a dimension
indices into the global grid range from 0 to Ngrid-1 in that dim
shift determines position of grid pt within grid cell
shift = 0.5 for cell center, 0.0 for lower-left corner
extra = 0 if grid exactly covers the simulation box
extra = 1 if grid extends beyond the +y boundary by yfactor
effectively maps proc partitions to the box-size subset of the grid
lo/hi = inclusive lo/hi bounds for brick of global grid cells I own
lo grid index = first grid pt >= fraclo*bound
hi grid index = last grid pt < frachi*bound
if proc owns no grid cells in a dim, then inlo > inhi
special case: 2 procs share a boundary that a grid point is exactly on
the 2 test inequalities ensure a consistent decision as to which proc owns it
------------------------------------------------------------------------- */
void Grid2d::partition_grid(int ngrid, double fraclo, double frachi, double shift, int extra,
int &lo, int &hi)
{
if (extra == 0) {
lo = static_cast<int>(fraclo * ngrid);
while (lo + shift < fraclo * ngrid) lo++;
hi = static_cast<int>(frachi * ngrid);
while (hi + shift >= frachi * ngrid) hi--;
} else {
lo = static_cast<int>(fraclo * ngrid / yfactor);
while (lo + shift < fraclo * ngrid / yfactor) lo++;
hi = static_cast<int>(frachi * ngrid / yfactor);
while (hi + shift >= frachi * ngrid / yfactor) hi--;
}
}
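/* ----------------------------------------------------------------------
   worked example (not from the original source): ngrid = 10 cells,
   proc owns the fractional range [0.25,0.5), shift = 0.5 (cell centers)
     lo = (int) (0.25*10) = 2, center 2.5 is not < 2.5, so lo stays 2
     hi = (int) (0.5*10) = 5, center 5.5 >= 5.0, decrement to hi = 4
   so this proc owns cells 2..4; with shift = 0.0 (cell corners) the same
   fractions give lo = 3, hi = 4, since corner 2.0 lies below 2.5
------------------------------------------------------------------------- */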
// clang-format off
/* ----------------------------------------------------------------------
extend ghost grid cells in each direction beyond owned grid
indices into the global grid range from 0 to N-1 in each dim
ghost cell indices for periodic systems can be < 0 or >= N
------------------------------------------------------------------------- */
void Grid2d::ghost_grid()
{
double *prd,*boxlo,*sublo,*subhi;
int triclinic = domain->triclinic;
if (triclinic == 0) {
prd = domain->prd;
boxlo = domain->boxlo;
sublo = domain->sublo;
subhi = domain->subhi;
} else {
prd = domain->prd_lamda;
boxlo = domain->boxlo_lamda;
sublo = domain->sublo_lamda;
subhi = domain->subhi_lamda;
}
// for triclinic, maxdist = different value in each orthogonal direction
double dist[3] = {0.0,0.0,0.0};
if (triclinic == 0) dist[0] = dist[1] = dist[2] = maxdist;
else MathExtra::tribbox(domain->h,maxdist,&dist[0]);
// lo/hi = min/max index of global grid cells my owned atoms can be mapped to
// includes effects of maxdist and shift_atom settings
// lo/hi can be further extended by stencil_atom and stencil_grid settings
// all those settings are set by caller
// ghost cell layers needed in each dim/dir = max of two extension effects
// OFFSET allows generation of negative indices with static_cast
// out xy lo/hi = index range of owned + ghost cells
// if yextra, ny and effective prd[1] are both larger, so dyinv is the same
double dxinv = nx / prd[0];
double dyinv = ny / prd[1];
if (yextra) dyinv = ny / (prd[1] * yfactor);
int lo, hi;
lo = static_cast<int>((sublo[0]-dist[0]-boxlo[0]) * dxinv + shift_atom_lo + OFFSET) - OFFSET;
hi = static_cast<int>((subhi[0]+dist[0]-boxlo[0]) * dxinv + shift_atom_hi + OFFSET) - OFFSET;
outxlo = MIN(lo - stencil_atom_lo, inxlo - stencil_grid_lo);
outxhi = MAX(hi + stencil_atom_hi, inxhi + stencil_grid_hi);
lo = static_cast<int>((sublo[1]-dist[1]-boxlo[1]) * dyinv + shift_atom_lo + OFFSET) - OFFSET;
hi = static_cast<int>((subhi[1]+dist[1]-boxlo[1]) * dyinv + shift_atom_hi + OFFSET) - OFFSET;
outylo = MIN(lo - stencil_atom_lo, inylo - stencil_grid_lo);
outyhi = MAX(hi + stencil_atom_hi, inyhi + stencil_grid_hi);
// if yextra = 1:
// adjust grid boundaries for processors at +y end,
// to include added empty grid cells between periodically repeating slabs
// in this case:
// want grid data forward communicated from +y proc to -y proc, but not vice versa
// want grid data reverse communicated from -y proc to +y proc, but not vice versa
// this is accomplished by inyhi = outyhi on +y end (no ghost cells)
// also ensure no other procs use ghost cells beyond +y limit
if (yextra) {
if (comm->layout != Comm::LAYOUT_TILED) {
if (comm->myloc[1] == comm->procgrid[1]-1) inyhi = outyhi = ny - 1;
} else {
if (comm->mysplit[1][1] == 1.0) inyhi = outyhi = ny - 1;
}
outyhi = MIN(outyhi,ny-1);
}
// limit out xy lo/hi indices to global grid for non-periodic dims
// if yextra = 1, treat y-dimension as if periodic
int *periodicity = domain->periodicity;
if (!periodicity[0]) {
outxlo = MAX(0,outxlo);
outxhi = MIN(nx-1,outxhi);
}
if (!periodicity[1] && !yextra) {
outylo = MAX(0,outylo);
outyhi = MIN(ny-1,outyhi);
}
}
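/* ----------------------------------------------------------------------
   worked example of the OFFSET trick above (not from the original source):
   with boxlo = 0.0, prd[0] = 10.0, nx = 10 (dxinv = 1.0), dist = 1.0,
   shift_atom_lo = 0.5, a proc with sublo[0] = 0.0 computes
     (0.0 - 1.0 - 0.0) * 1.0 + 0.5 + 16384 = 16383.5
   static_cast<int> truncates to 16383, subtracting OFFSET gives lo = -1,
   i.e. one ghost layer past the periodic -x boundary; a plain
   static_cast of -0.5 would truncate toward zero and give 0 instead
------------------------------------------------------------------------- */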
/* ----------------------------------------------------------------------
store copy of info from Comm class about processor partitioning
used when a remap is performed between two Grid instances, old and new
------------------------------------------------------------------------- */
void Grid2d::extract_comm_info()
{
// for non TILED layout:
// proc xy lohi = my 4 neighbor procs in this MPI_Comm
// these proc IDs can be overridden by caller using set_proc_neighs()
// xy split = copy of 1d vectors in Comm
// grid2proc = copy of 3d array in Comm
if (comm->layout != Comm::LAYOUT_TILED) {
procxlo = comm->procneigh[0][0];
procxhi = comm->procneigh[0][1];
procylo = comm->procneigh[1][0];
procyhi = comm->procneigh[1][1];
xsplit = new double[comm->procgrid[0]+1];
ysplit = new double[comm->procgrid[1]+1];
memcpy(xsplit, comm->xsplit, sizeof(double) * (comm->procgrid[0]+1));
memcpy(ysplit, comm->ysplit, sizeof(double) * (comm->procgrid[1]+1));
memory->create(grid2proc,comm->procgrid[0],comm->procgrid[1],comm->procgrid[2],
"grid2d:grid2proc");
memcpy(&grid2proc[0][0][0],&comm->grid2proc[0][0][0],
sizeof(int) * comm->procgrid[0] * comm->procgrid[1] * comm->procgrid[2]);
}
// for TILED layout:
// create RCB tree of grid partitioning info for grid decomp
// Comm provides dim info for this proc, stored as RCBinfo.dim
// dim is -1 for proc 0, but never accessed
// RCBinfo.cut = this proc's inlo in that dim
// Allgather creates the tree of dims and cuts
if (comm->layout == Comm::LAYOUT_TILED) {
rcbinfo = (RCBinfo *)
memory->smalloc(nprocs*sizeof(RCBinfo),"grid2d:rcbinfo");
RCBinfo rcbone;
rcbone.dim = comm->rcbcutdim;
if (rcbone.dim <= 0) rcbone.cut = inxlo;
else if (rcbone.dim == 1) rcbone.cut = inylo;
MPI_Allgather(&rcbone,sizeof(RCBinfo),MPI_CHAR,
rcbinfo,sizeof(RCBinfo),MPI_CHAR,gridcomm);
}
}
// ----------------------------------------------------------------------
// setup of local owned/ghost grid comm
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
setup communication of owned/ghost grid cells
either for brick decomp or tiled decomp
return sizes of two buffers needed for communication
nbuf1 = largest pack or unpack in any Send or Recv or Copy
nbuf2 = largest of sum of all packs or unpacks in Send or Recv
for brick comm, nbuf1 = nbuf2
for tiled comm, nbuf2 >= nbuf1
nbuf1,nbuf2 are counts of grid cells
caller converts them to message sizes for grid data it stores
------------------------------------------------------------------------- */
void Grid2d::setup_comm(int &nbuf1, int &nbuf2)
{
if (comm->layout != Comm::LAYOUT_TILED) setup_comm_brick(nbuf1,nbuf2);
else setup_comm_tiled(nbuf1,nbuf2);
}
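/* ----------------------------------------------------------------------
   usage sketch (illustrative only; the caller, per-cell value count nper,
   and buffer names are assumptions; the FIX constant is the caller flag
   tested below, assumed to live in grid2d.h):
     int nbuf1, nbuf2;
     grid->setup_comm(nbuf1, nbuf2);
     double *buf1, *buf2;
     memory->create(buf1, nbuf1*nper, "caller:buf1");
     memory->create(buf2, nbuf2*nper, "caller:buf2");
     grid->forward_comm(Grid2d::FIX, this, 0, nper, sizeof(double),
                        buf1, buf2, MPI_DOUBLE);
   the caller must also implement the pack/unpack callbacks invoked below
------------------------------------------------------------------------- */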
/* ----------------------------------------------------------------------
setup owned/ghost comm for brick comm
each proc has 4 neighbors
comm pattern = series of swaps with one of those 4 procs
can be multiple swaps with same proc if ghost extent is large
swap may not be symmetric if both procs do not need same layers of ghosts
all procs perform same # of swaps in a direction, even if some don't need it
------------------------------------------------------------------------- */
void Grid2d::setup_comm_brick(int &nbuf1, int &nbuf2)
{
int nsent,sendfirst,sendlast,recvfirst,recvlast;
int sendplanes,recvplanes;
int notdoneme,notdone;
// notify 4 neighbor procs how many ghost grid planes I need from them
// ghost xy lo = # of my lower grid planes that proc xy lo needs as its ghosts
// ghost xy hi = # of my upper grid planes that proc xy hi needs as its ghosts
// if this proc is its own neighbor across periodic boundary, value is from self
int nplanes = inxlo - outxlo;
if (procxlo != me)
MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0,
&ghostxhi,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE);
else ghostxhi = nplanes;
nplanes = outxhi - inxhi;
if (procxhi != me)
MPI_Sendrecv(&nplanes,1,MPI_INT,procxhi,0,
&ghostxlo,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE);
else ghostxlo = nplanes;
nplanes = inylo - outylo;
if (procylo != me)
MPI_Sendrecv(&nplanes,1,MPI_INT,procylo,0,
&ghostyhi,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE);
else ghostyhi = nplanes;
nplanes = outyhi - inyhi;
if (procyhi != me)
MPI_Sendrecv(&nplanes,1,MPI_INT,procyhi,0,
&ghostylo,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE);
else ghostylo = nplanes;
// setup swaps = exchange of grid data with one of 4 neighbor procs
// can be more than one in a direction if ghost region extends beyond neigh proc
// all procs have same swap count, but a swap's npack/nunpack counts can be zero
nswap = 0;
// send own grid pts to -x processor, recv ghost grid pts from +x processor
nsent = 0;
sendfirst = inxlo;
sendlast = inxhi;
recvfirst = inxhi+1;
notdone = 1;
while (notdone) {
if (nswap == maxswap) grow_swap();
swap[nswap].sendproc = procxlo;
swap[nswap].recvproc = procxhi;
sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent);
swap[nswap].npack =
indices(swap[nswap].packlist,
sendfirst,sendfirst+sendplanes-1,inylo,inyhi);
if (procxlo != me)
MPI_Sendrecv(&sendplanes,1,MPI_INT,procxlo,0,
&recvplanes,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE);
else recvplanes = sendplanes;
swap[nswap].nunpack =
indices(swap[nswap].unpacklist,
recvfirst,recvfirst+recvplanes-1,inylo,inyhi);
nsent += sendplanes;
sendfirst += sendplanes;
sendlast += recvplanes;
recvfirst += recvplanes;
nswap++;
if (nsent < ghostxlo) notdoneme = 1;
else notdoneme = 0;
MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
}
// send own grid pts to +x processor, recv ghost grid pts from -x processor
nsent = 0;
sendfirst = inxlo;
sendlast = inxhi;
recvlast = inxlo-1;
notdone = 1;
while (notdone) {
if (nswap == maxswap) grow_swap();
swap[nswap].sendproc = procxhi;
swap[nswap].recvproc = procxlo;
sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent);
swap[nswap].npack =
indices(swap[nswap].packlist,
sendlast-sendplanes+1,sendlast,inylo,inyhi);
if (procxhi != me)
MPI_Sendrecv(&sendplanes,1,MPI_INT,procxhi,0,
&recvplanes,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE);
else recvplanes = sendplanes;
swap[nswap].nunpack =
indices(swap[nswap].unpacklist,
recvlast-recvplanes+1,recvlast,inylo,inyhi);
nsent += sendplanes;
sendfirst -= recvplanes;
sendlast -= sendplanes;
recvlast -= recvplanes;
nswap++;
if (nsent < ghostxhi) notdoneme = 1;
else notdoneme = 0;
MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
}
// send own grid pts to -y processor, recv ghost grid pts from +y processor
nsent = 0;
sendfirst = inylo;
sendlast = inyhi;
recvfirst = inyhi+1;
notdone = 1;
while (notdone) {
if (nswap == maxswap) grow_swap();
swap[nswap].sendproc = procylo;
swap[nswap].recvproc = procyhi;
sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent);
swap[nswap].npack =
indices(swap[nswap].packlist,
outxlo,outxhi,sendfirst,sendfirst+sendplanes-1);
if (procylo != me)
MPI_Sendrecv(&sendplanes,1,MPI_INT,procylo,0,
&recvplanes,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE);
else recvplanes = sendplanes;
swap[nswap].nunpack =
indices(swap[nswap].unpacklist,
outxlo,outxhi,recvfirst,recvfirst+recvplanes-1);
nsent += sendplanes;
sendfirst += sendplanes;
sendlast += recvplanes;
recvfirst += recvplanes;
nswap++;
if (nsent < ghostylo) notdoneme = 1;
else notdoneme = 0;
MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
}
// send own grid pts to +y processor, recv ghost grid pts from -y processor
nsent = 0;
sendfirst = inylo;
sendlast = inyhi;
recvlast = inylo-1;
notdone = 1;
while (notdone) {
if (nswap == maxswap) grow_swap();
swap[nswap].sendproc = procyhi;
swap[nswap].recvproc = procylo;
sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent);
swap[nswap].npack =
indices(swap[nswap].packlist,
outxlo,outxhi,sendlast-sendplanes+1,sendlast);
if (procyhi != me)
MPI_Sendrecv(&sendplanes,1,MPI_INT,procyhi,0,
&recvplanes,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE);
else recvplanes = sendplanes;
swap[nswap].nunpack =
indices(swap[nswap].unpacklist,
outxlo,outxhi,recvlast-recvplanes+1,recvlast);
nsent += sendplanes;
sendfirst -= recvplanes;
sendlast -= sendplanes;
recvlast -= recvplanes;
nswap++;
if (nsent < ghostyhi) notdoneme = 1;
else notdoneme = 0;
MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
}
// ngrid = max of any forward/reverse pack/unpack grid points
int ngrid = 0;
for (int i = 0; i < nswap; i++) {
ngrid = MAX(ngrid,swap[i].npack);
ngrid = MAX(ngrid,swap[i].nunpack);
}
nbuf1 = nbuf2 = ngrid;
}
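/* ----------------------------------------------------------------------
   worked example of the swap loops above (not from the original source):
   suppose this proc owns 3 x-planes and procxlo needs ghostxlo = 5 ghost
   planes from it: swap 1 packs the 3 owned planes, swap 2 packs 2 of the
   planes just received from procxhi, so data is relayed across a
   non-adjacent proc; the MPI_Allreduce on notdone keeps every proc in the
   loop for the same number of swaps even if it has nothing left to send
------------------------------------------------------------------------- */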
/* ----------------------------------------------------------------------
setup owned/ghost comm for tiled comm
each proc has arbitrary # of neighbors that overlap its ghost extent
identify which procs will send me ghost cells, and vice versa
may not be symmetric if both procs do not need same layers of ghosts
comm pattern = post recvs for all my ghosts, send my owned, wait on recvs
no exchanges by dimension, unlike CommTiled forward/reverse comm of particles
------------------------------------------------------------------------- */
void Grid2d::setup_comm_tiled(int &nbuf1, int &nbuf2)
{
int i,m;
double xlo,xhi,ylo,yhi;
int ghostbox[4],pbc[2];
// find overlaps of my extended ghost box with all owned boxes
// accounts for ghost box overlapping periodic boundaries
// noverlap = # of overlaps, including self
// overlap = vector of overlap info using Overlap data struct
ghostbox[0] = outxlo;
ghostbox[1] = outxhi;
ghostbox[2] = outylo;
ghostbox[3] = outyhi;
pbc[0] = pbc[1] = 0;
Overlap *overlap;
int noverlap = compute_overlap(1,ghostbox,pbc,overlap);
// send each proc an overlap message
// content: me, index of my overlap, box that overlaps with its owned cells
// ncopy = # of overlaps with myself, across a periodic boundary
// skip copy to self when non-PBC
int *proclist;
memory->create(proclist,noverlap,"grid2d:proclist");
srequest = (Request *)
memory->smalloc(noverlap*sizeof(Request),"grid2d:srequest");
int nsend_request = 0;
ncopy = 0;
for (m = 0; m < noverlap; m++) {
if (overlap[m].proc == me) {
if (overlap[m].pbc[0] == 0 && overlap[m].pbc[1] == 0) continue;
ncopy++;
} else {
proclist[nsend_request] = overlap[m].proc;
srequest[nsend_request].sender = me;
srequest[nsend_request].index = m;
for (i = 0; i < 4; i++)
srequest[nsend_request].box[i] = overlap[m].box[i];
nsend_request++;
}
}
auto irregular = new Irregular(lmp);
int nrecv_request = irregular->create_data(nsend_request,proclist,1);
auto rrequest = (Request *) memory->smalloc(nrecv_request*sizeof(Request),"grid2d:rrequest");
irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest);
irregular->destroy_data();
// compute overlaps between received ghost boxes and my owned box
// overlap box used to setup my Send data struct and respond to requests
send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"grid2d:send");
sresponse = (Response *) memory->smalloc(nrecv_request*sizeof(Response),"grid2d:sresponse");
memory->destroy(proclist);
memory->create(proclist,nrecv_request,"grid2d:proclist");
for (m = 0; m < nrecv_request; m++) {
send[m].proc = rrequest[m].sender;
xlo = MAX(rrequest[m].box[0],inxlo);
xhi = MIN(rrequest[m].box[1],inxhi);
ylo = MAX(rrequest[m].box[2],inylo);
yhi = MIN(rrequest[m].box[3],inyhi);
send[m].npack = indices(send[m].packlist,xlo,xhi,ylo,yhi);
proclist[m] = rrequest[m].sender;
sresponse[m].index = rrequest[m].index;
sresponse[m].box[0] = xlo;
sresponse[m].box[1] = xhi;
sresponse[m].box[2] = ylo;
sresponse[m].box[3] = yhi;
}
nsend = nrecv_request;
// reply to each Request message with a Response message
// content: index for the overlap on requestor, overlap box on my owned grid
int nsend_response = nrecv_request;
int nrecv_response = irregular->create_data(nsend_response,proclist,1);
auto rresponse = (Response *) memory->smalloc(nrecv_response*sizeof(Response),"grid2d:rresponse");
irregular->exchange_data((char *) sresponse,sizeof(Response),(char *) rresponse);
irregular->destroy_data();
delete irregular;
// process received responses
// box used to setup my Recv data struct after unwrapping via PBC
// adjacent = 0 if any box of ghost cells does not adjoin my owned cells
recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"grid2d:recv");
adjacent = 1;
for (i = 0; i < nrecv_response; i++) {
m = rresponse[i].index;
recv[i].proc = overlap[m].proc;
xlo = rresponse[i].box[0] + overlap[m].pbc[0] * nx;
xhi = rresponse[i].box[1] + overlap[m].pbc[0] * nx;
ylo = rresponse[i].box[2] + overlap[m].pbc[1] * ny;
yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny;
recv[i].nunpack = indices(recv[i].unpacklist,xlo,xhi,ylo,yhi);
if (xlo != inxhi+1 && xhi != inxlo-1 &&
ylo != inyhi+1 && yhi != inylo-1) adjacent = 0;
}
nrecv = nrecv_response;
// create Copy data struct from overlaps with self
// skip copy to self when non-PBC
copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"grid2d:copy");
ncopy = 0;
for (m = 0; m < noverlap; m++) {
if (overlap[m].proc != me) continue;
if (overlap[m].pbc[0] == 0 && overlap[m].pbc[1] == 0) continue;
xlo = overlap[m].box[0];
xhi = overlap[m].box[1];
ylo = overlap[m].box[2];
yhi = overlap[m].box[3];
copy[ncopy].npack = indices(copy[ncopy].packlist,xlo,xhi,ylo,yhi);
xlo = overlap[m].box[0] + overlap[m].pbc[0] * nx;
xhi = overlap[m].box[1] + overlap[m].pbc[0] * nx;
ylo = overlap[m].box[2] + overlap[m].pbc[1] * ny;
yhi = overlap[m].box[3] + overlap[m].pbc[1] * ny;
copy[ncopy].nunpack = indices(copy[ncopy].unpacklist,xlo,xhi,ylo,yhi);
ncopy++;
}
// set offsets for received data
int offset = 0;
for (m = 0; m < nsend; m++) {
send[m].offset = offset;
offset += send[m].npack;
}
offset = 0;
for (m = 0; m < nrecv; m++) {
recv[m].offset = offset;
offset += recv[m].nunpack;
}
// length of MPI requests vector is max of nsend, nrecv
int nrequest = MAX(nsend,nrecv);
requests = new MPI_Request[nrequest];
// clean-up
clean_overlap();
memory->destroy(proclist);
memory->sfree(srequest);
memory->sfree(rrequest);
memory->sfree(sresponse);
memory->sfree(rresponse);
// nbuf1 = largest pack or unpack in any Send or Recv or Copy
// nbuf2 = largest of sum of all packs or unpacks in Send or Recv
nbuf1 = 0;
for (m = 0; m < ncopy; m++) {
nbuf1 = MAX(nbuf1,copy[m].npack);
nbuf1 = MAX(nbuf1,copy[m].nunpack);
}
int nbufs = 0;
for (m = 0; m < nsend; m++) {
nbuf1 = MAX(nbuf1,send[m].npack);
nbufs += send[m].npack;
}
int nbufr = 0;
for (m = 0; m < nrecv; m++) {
nbuf1 = MAX(nbuf1,recv[m].nunpack);
nbufr += recv[m].nunpack;
}
nbuf2 = MAX(nbufs,nbufr);
}
// ----------------------------------------------------------------------
// query locality of forward/reverse grid comm
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
check if all procs only need ghost info from adjacent procs
return 1 if yes, 0 if no
------------------------------------------------------------------------- */
int Grid2d::ghost_adjacent()
{
if (comm->layout != Comm::LAYOUT_TILED) return ghost_adjacent_brick();
return ghost_adjacent_tiled();
}
/* ----------------------------------------------------------------------
adjacent = 0 if a proc's ghost xy lohi values exceed its subdomain size
return 0 if adjacent=0 for any proc, else 1
------------------------------------------------------------------------- */
int Grid2d::ghost_adjacent_brick()
{
adjacent = 1;
if (ghostxlo > inxhi-inxlo+1) adjacent = 0;
if (ghostxhi > inxhi-inxlo+1) adjacent = 0;
if (ghostylo > inyhi-inylo+1) adjacent = 0;
if (ghostyhi > inyhi-inylo+1) adjacent = 0;
int adjacent_all;
MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm);
return adjacent_all;
}
/* ----------------------------------------------------------------------
adjacent = 0 if a proc's received ghosts were flagged
as non-adjacent in setup_comm_tiled()
return 0 if adjacent=0 for any proc, else 1
------------------------------------------------------------------------- */
int Grid2d::ghost_adjacent_tiled()
{
int adjacent_all;
MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm);
return adjacent_all;
}
// ----------------------------------------------------------------------
// forward/reverse comm of owned/ghost grid data via callbacks
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
forward comm of my owned cells to other's ghost cells
------------------------------------------------------------------------- */
void Grid2d::forward_comm(int caller, void *ptr, int which, int nper, int nbyte,
void *buf1, void *buf2, MPI_Datatype datatype)
{
if (comm->layout != Comm::LAYOUT_TILED) {
if (caller == KSPACE)
forward_comm_brick<KSpace>((KSpace *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
else if (caller == PAIR)
forward_comm_brick<Pair>((Pair *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
else if (caller == FIX)
forward_comm_brick<Fix>((Fix *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
} else {
if (caller == KSPACE)
forward_comm_tiled<KSpace>((KSpace *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
else if (caller == PAIR)
forward_comm_tiled<Pair>((Pair *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
else if (caller == FIX)
forward_comm_tiled<Fix>((Fix *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
}
}
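/* ----------------------------------------------------------------------
   sketch of the pack/unpack callbacks a Fix-style caller provides for the
   templated comm below (hypothetical caller storing one double per grid
   cell; FixMyGrid and grid_data are assumptions, not from this file):
     void FixMyGrid::pack_forward_grid(int, void *vbuf, int nlist, int *list)
     {
       double *buf = (double *) vbuf;
       for (int i = 0; i < nlist; i++) buf[i] = grid_data[list[i]];
     }
     void FixMyGrid::unpack_forward_grid(int, void *vbuf, int nlist, int *list)
     {
       double *buf = (double *) vbuf;
       for (int i = 0; i < nlist; i++) grid_data[list[i]] = buf[i];
     }
   list[] holds the 1d offsets produced by indices() into the caller's 2d
   array allocated over the full (ghost or set_caller_grid) extent
------------------------------------------------------------------------- */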
/* ----------------------------------------------------------------------
forward comm for brick decomp via list of swaps with 4 neighbor procs
------------------------------------------------------------------------- */
template < class T >
void Grid2d::
forward_comm_brick(T *ptr, int which, int nper, int /*nbyte*/,
void *buf1, void *buf2, MPI_Datatype datatype)
{
int m;
MPI_Request request;
for (m = 0; m < nswap; m++) {
if (swap[m].sendproc == me)
ptr->pack_forward_grid(which,buf2,swap[m].npack,swap[m].packlist);
else
ptr->pack_forward_grid(which,buf1,swap[m].npack,swap[m].packlist);
if (swap[m].sendproc != me) {
if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype,
swap[m].recvproc,0,gridcomm,&request);
if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype,
swap[m].sendproc,0,gridcomm);
if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE);
}
ptr->unpack_forward_grid(which,buf2,swap[m].nunpack,swap[m].unpacklist);
}
}
/* ----------------------------------------------------------------------
forward comm for tiled decomp via Send/Recv lists of each neighbor proc
------------------------------------------------------------------------- */
template < class T >
void Grid2d::
forward_comm_tiled(T *ptr, int which, int nper, int nbyte,
void *buf1, void *vbuf2, MPI_Datatype datatype)
{
int i,m,offset;
auto buf2 = (char *) vbuf2;
// post all receives
for (m = 0; m < nrecv; m++) {
offset = nper * recv[m].offset * nbyte;
MPI_Irecv((void *) &buf2[offset],nper*recv[m].nunpack,datatype,
recv[m].proc,0,gridcomm,&requests[m]);
}
// perform all sends to other procs
for (m = 0; m < nsend; m++) {
ptr->pack_forward_grid(which,buf1,send[m].npack,send[m].packlist);
MPI_Send(buf1,nper*send[m].npack,datatype,send[m].proc,0,gridcomm);
}
// perform all copies to self
for (m = 0; m < ncopy; m++) {
ptr->pack_forward_grid(which,buf1,copy[m].npack,copy[m].packlist);
ptr->unpack_forward_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist);
}
// unpack all received data
for (i = 0; i < nrecv; i++) {
MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE);
offset = nper * recv[m].offset * nbyte;
ptr->unpack_forward_grid(which,(void *) &buf2[offset],
recv[m].nunpack,recv[m].unpacklist);
}
}
/* ----------------------------------------------------------------------
reverse comm of my ghost cells to sum to owner cells
------------------------------------------------------------------------- */
void Grid2d::reverse_comm(int caller, void *ptr, int which, int nper, int nbyte,
void *buf1, void *buf2, MPI_Datatype datatype)
{
if (comm->layout != Comm::LAYOUT_TILED) {
if (caller == KSPACE)
reverse_comm_brick<KSpace>((KSpace *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
else if (caller == PAIR)
reverse_comm_brick<Pair>((Pair *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
else if (caller == FIX)
reverse_comm_brick<Fix>((Fix *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
} else {
if (caller == KSPACE)
reverse_comm_tiled<KSpace>((KSpace *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
else if (caller == PAIR)
reverse_comm_tiled<Pair>((Pair *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
else if (caller == FIX)
reverse_comm_tiled<Fix>((Fix *) ptr,which,nper,nbyte,
buf1,buf2,datatype);
}
}
/* ----------------------------------------------------------------------
reverse comm for brick decomp via list of swaps with 4 neighbor procs
------------------------------------------------------------------------- */
template < class T >
void Grid2d::
reverse_comm_brick(T *ptr, int which, int nper, int /*nbyte*/,
void *buf1, void *buf2, MPI_Datatype datatype)
{
int m;
MPI_Request request;
for (m = nswap-1; m >= 0; m--) {
if (swap[m].recvproc == me)
ptr->pack_reverse_grid(which,buf2,swap[m].nunpack,swap[m].unpacklist);
else
ptr->pack_reverse_grid(which,buf1,swap[m].nunpack,swap[m].unpacklist);
if (swap[m].recvproc != me) {
if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype,
swap[m].sendproc,0,gridcomm,&request);
if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype,
swap[m].recvproc,0,gridcomm);
if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE);
}
ptr->unpack_reverse_grid(which,buf2,swap[m].npack,swap[m].packlist);
}
}
/* ----------------------------------------------------------------------
reverse comm for tiled decomp via Send/Recv lists of each neighbor proc
------------------------------------------------------------------------- */
template < class T >
void Grid2d::
reverse_comm_tiled(T *ptr, int which, int nper, int nbyte,
void *buf1, void *vbuf2, MPI_Datatype datatype)
{
int i,m,offset;
auto buf2 = (char *) vbuf2;
// post all receives
for (m = 0; m < nsend; m++) {
offset = nper * send[m].offset * nbyte;
MPI_Irecv((void *) &buf2[offset],nper*send[m].npack,datatype,
send[m].proc,0,gridcomm,&requests[m]);
}
// perform all sends to other procs
for (m = 0; m < nrecv; m++) {
ptr->pack_reverse_grid(which,buf1,recv[m].nunpack,recv[m].unpacklist);
MPI_Send(buf1,nper*recv[m].nunpack,datatype,recv[m].proc,0,gridcomm);
}
// perform all copies to self
for (m = 0; m < ncopy; m++) {
ptr->pack_reverse_grid(which,buf1,copy[m].nunpack,copy[m].unpacklist);
ptr->unpack_reverse_grid(which,buf1,copy[m].npack,copy[m].packlist);
}
// unpack all received data
for (i = 0; i < nsend; i++) {
MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE);
offset = nper * send[m].offset * nbyte;
ptr->unpack_reverse_grid(which,(void *) &buf2[offset],
send[m].npack,send[m].packlist);
}
}
// ----------------------------------------------------------------------
// remap comm between 2 old/new grid decomposition of owned grid data
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
setup remap from old grid decomposition to this grid decomposition
return sizes of two buffers needed for communication
either for brick decomp or tiling decomp
nbuf1 = largest pack or unpack in any Send or Recv or Copy
nbuf2 = largest of sum of all packs or unpacks in Send or Recv
for brick comm, nbuf1 = nbuf2
for tiled comm, nbuf2 >= nbuf1
nbuf1,nbuf2 are just counts of grid points
caller converts them to message size for grid data it stores
------------------------------------------------------------------------- */
void Grid2d::setup_remap(Grid2d *old, int &nremap_buf1, int &nremap_buf2)
{
int m;
int pbc[2];
int *box;
// deallocate any existing remap data structs
deallocate_remap();
// overlaps of my old decomp owned box with all owned boxes in new decomp
// noverlap_old = # of overlaps, including self
// overlap_old = vector of overlap info in Overlap data struct
int oldbox[4];
old->get_bounds_owned(oldbox[0],oldbox[1],oldbox[2],oldbox[3]);
pbc[0] = pbc[1] = 0;
Overlap *overlap_old;
int noverlap_old = compute_overlap(0,oldbox,pbc,overlap_old);
// use overlap_old to construct send and copy lists
// skip overlaps that contain no grid cells
self_remap = 0;
nsend_remap = 0;
for (m = 0; m < noverlap_old; m++) {
box = overlap_old[m].box;
if (box[0] > box[1] || box[2] > box[3]) continue;
if (overlap_old[m].proc == me) self_remap = 1;
else nsend_remap++;
}
send_remap = new Send[nsend_remap];
nsend_remap = 0;
for (m = 0; m < noverlap_old; m++) {
box = overlap_old[m].box;
if (box[0] > box[1] || box[2] > box[3]) continue;
if (overlap_old[m].proc == me) {
copy_remap.npack =
old->indices(copy_remap.packlist,box[0],box[1],box[2],box[3]);
} else {
send_remap[nsend_remap].proc = overlap_old[m].proc;
send_remap[nsend_remap].npack =
old->indices(send_remap[nsend_remap].packlist,
box[0],box[1],box[2],box[3]);
nsend_remap++;
}
}
// overlaps of my new decomp owned box with all owned boxes in old decomp
// noverlap_new = # of overlaps, including self
// overlap_new = vector of overlap info in Overlap data struct
int newbox[4];
get_bounds_owned(newbox[0],newbox[1],newbox[2],newbox[3]);
pbc[0] = pbc[1] = 0;
Overlap *overlap_new;
int noverlap_new = old->compute_overlap(0,newbox,pbc,overlap_new);
// use overlap_new to construct recv and copy lists
// skip overlaps that contain no grid cells
// set offsets for Recv data
nrecv_remap = 0;
for (m = 0; m < noverlap_new; m++) {
box = overlap_new[m].box;
if (box[0] > box[1] || box[2] > box[3]) continue;
if (overlap_new[m].proc != me) nrecv_remap++;
}
recv_remap = new Recv[nrecv_remap];
nrecv_remap = 0;
for (m = 0; m < noverlap_new; m++) {
box = overlap_new[m].box;
if (box[0] > box[1] || box[2] > box[3]) continue;
if (overlap_new[m].proc == me) {
copy_remap.nunpack =
indices(copy_remap.unpacklist,box[0],box[1],box[2],box[3]);
} else {
recv_remap[nrecv_remap].proc = overlap_new[m].proc;
recv_remap[nrecv_remap].nunpack =
indices(recv_remap[nrecv_remap].unpacklist,
box[0],box[1],box[2],box[3]);
nrecv_remap++;
}
}
// set offsets for received data
int offset = 0;
for (m = 0; m < nrecv_remap; m++) {
recv_remap[m].offset = offset;
offset += recv_remap[m].nunpack;
}
// length of MPI requests vector = nrecv_remap
delete [] requests_remap;
requests_remap = new MPI_Request[nrecv_remap];
// clean-up
clean_overlap();
old->clean_overlap();
// nremap_buf1 = largest pack or unpack in any Send or Recv or Copy
// nremap_buf2 = sum of all unpacks in Recv
nremap_buf1 = 0;
if (self_remap) {
nremap_buf1 = MAX(nremap_buf1,copy_remap.npack);
nremap_buf1 = MAX(nremap_buf1,copy_remap.nunpack);
}
for (m = 0; m < nsend_remap; m++)
nremap_buf1 = MAX(nremap_buf1,send_remap[m].npack);
nremap_buf2 = 0;
for (m = 0; m < nrecv_remap; m++) {
nremap_buf1 = MAX(nremap_buf1,recv_remap[m].nunpack);
nremap_buf2 += recv_remap[m].nunpack;
}
}
/* ----------------------------------------------------------------------
perform remap from old grid decomposition to this grid decomposition
pack/unpack operations are performed by caller via callbacks
------------------------------------------------------------------------- */
void Grid2d::remap(int caller, void *ptr, int which, int nper, int nbyte,
void *buf1, void *buf2, MPI_Datatype datatype)
{
if (caller == FIX) remap_style<Fix>((Fix *) ptr,which,nper,nbyte,buf1,buf2,datatype);
}
/* ------------------------------------------------------------------------- */
template < class T >
void Grid2d::remap_style(T *ptr, int which, int nper, int nbyte,
void *buf1, void *vbuf2, MPI_Datatype datatype)
{
int i,m,offset;
auto buf2 = (char *) vbuf2;
// post all receives
for (m = 0; m < nrecv_remap; m++) {
offset = nper * recv_remap[m].offset * nbyte;
MPI_Irecv((void *) &buf2[offset],nper*recv_remap[m].nunpack,datatype,
recv_remap[m].proc,0,gridcomm,&requests_remap[m]);
}
// perform all sends to other procs
for (m = 0; m < nsend_remap; m++) {
ptr->pack_remap_grid(which,buf1,send_remap[m].npack,send_remap[m].packlist);
MPI_Send(buf1,nper*send_remap[m].npack,datatype,send_remap[m].proc,0,gridcomm);
}
// perform remap to self if defined
if (self_remap) {
ptr->pack_remap_grid(which,buf1,copy_remap.npack,copy_remap.packlist);
ptr->unpack_remap_grid(which,buf1,copy_remap.nunpack,copy_remap.unpacklist);
}
// unpack all received data
for (i = 0; i < nrecv_remap; i++) {
MPI_Waitany(nrecv_remap,requests_remap,&m,MPI_STATUS_IGNORE);
offset = nper * recv_remap[m].offset * nbyte;
ptr->unpack_remap_grid(which,(void *) &buf2[offset],
recv_remap[m].nunpack,recv_remap[m].unpacklist);
}
}
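/* ----------------------------------------------------------------------
   remap usage sketch (illustrative only; the old/new grid pointers,
   buffers, and per-cell count nper are assumptions): after the proc
   decomposition changes, e.g. on load balancing, a caller keeps the old
   Grid2d, creates a new one, then moves its owned data across:
     int nbuf1, nbuf2;
     grid_new->setup_remap(grid_old, nbuf1, nbuf2);
     // allocate buf1/buf2 of nbuf1*nper and nbuf2*nper values, then:
     grid_new->remap(Grid2d::FIX, this, 0, nper, sizeof(double),
                     buf1, buf2, MPI_DOUBLE);
   the caller implements pack_remap_grid()/unpack_remap_grid() callbacks
------------------------------------------------------------------------- */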
// ----------------------------------------------------------------------
// grid I/O methods
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
read grid values from a file
------------------------------------------------------------------------- */
void Grid2d::read_file(int caller, void *ptr, FILE *fp, int nchunk, int maxline)
{
if (caller == FIX)
read_file_style<Fix>((Fix *) ptr,fp,nchunk,maxline);
}
/* ----------------------------------------------------------------------
proc 0 reads one chunk of lines at a time from file
broadcast chunk buffer to other procs
call back to caller so it can process the chunk of lines
caller returns count of grid-value lines in chunk
------------------------------------------------------------------------- */
template < class T >
void Grid2d::read_file_style(T *ptr, FILE *fp, int nchunk, int maxline)
{
auto buffer = new char[nchunk * maxline];
bigint ntotal = (bigint) nx * ny;
bigint nread = 0;
while (nread < ntotal) {
int mychunk = MIN(ntotal - nread, nchunk);
int eof = utils::read_lines_from_file(fp, mychunk, maxline, buffer, me, world);
if (eof) error->all(FLERR, "Unexpected end of grid data file");
nread += ptr->unpack_read_grid(nchunk,buffer);
}
delete [] buffer;
}
/* ----------------------------------------------------------------------
write grid values to a file
------------------------------------------------------------------------- */
void Grid2d::write_file(int caller, void *ptr, int which,
int nper, int nbyte, MPI_Datatype datatype)
{
if (caller == FIX)
write_file_style<Fix>((Fix *) ptr, which, nper, nbyte, datatype);
}
/* ----------------------------------------------------------------------
each proc packs its owned grid data via callback to caller
proc 0 pings each proc for its packed grid data and owned bounds
proc 0 calls back to caller to write each proc's grid data to the file
------------------------------------------------------------------------- */
template < class T >
void Grid2d::write_file_style(T *ptr, int which,
int nper, int nbyte, MPI_Datatype datatype)
{
// maxsize = max size of grid data owned by any proc
int mysize = (inxhi-inxlo+1) * (inyhi-inylo+1);
mysize *= nper;
int maxsize;
MPI_Allreduce(&mysize,&maxsize,1,MPI_INT,MPI_MAX,world);
// pack my grid data via callback to caller
char *onebuf;
if (me == 0) memory->create(onebuf,maxsize*nbyte,"grid2d:onebuf");
else memory->create(onebuf,mysize*nbyte,"grid2d:onebuf");
ptr->pack_write_grid(which,onebuf);
// ping each proc for its grid data
// call back to caller with each proc's grid data
int tmp;
int bounds[4];
if (me == 0) {
MPI_Status status;
MPI_Request request;
for (int iproc = 0; iproc < nprocs; iproc++) {
if (iproc) {
MPI_Irecv(onebuf,maxsize,datatype,iproc,0,world,&request);
MPI_Send(&tmp,0,MPI_INT,iproc,0,world);
MPI_Wait(&request,&status);
MPI_Recv(bounds,4,MPI_INT,iproc,0,world,&status);
} else {
bounds[0] = inxlo;
bounds[1] = inxhi;
bounds[2] = inylo;
bounds[3] = inyhi;
}
ptr->unpack_write_grid(which,onebuf,bounds);
}
} else {
MPI_Recv(&tmp,0,MPI_INT,0,0,world,MPI_STATUS_IGNORE);
MPI_Rsend(onebuf,mysize,datatype,0,0,world);
bounds[0] = inxlo;
bounds[1] = inxhi;
bounds[2] = inylo;
bounds[3] = inyhi;
MPI_Send(bounds,4,MPI_INT,0,0,world);
}
// clean up
memory->destroy(onebuf);
}
// ----------------------------------------------------------------------
// overlap methods for brick and tiled RCB decompositions
// overlap = overlap of owned or owned+ghost box with all boxes of a decomposition
// for owned/ghost grid comm, called only by tiled decomposition
// brick decomp uses one or more comm passes with neigh procs
// like forward/reverse comm for atoms
// for remap, called by both brick and tiled decompositions
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
compute list of overlaps between box and the owned grid boxes of all procs
ghostflag = 1 if box includes ghost grid pts, called by setup_comm_tiled()
ghostflag = 0 if box has no ghost grid pts, called by setup_remap()
layout != LAYOUT_TILED is only invoked by setup_remap()
for brick decomp of Grid, done using xy split + grid2proc copied from Comm
for tiled decomp of Grid, done via recursive box drop on RCB tree
box = 4 integers = (xlo,xhi,ylo,yhi)
box can be owned cells or owned + ghost cells
pbc = flags for grid periodicity in each dim
if box includes ghost cells, it can overlap PBCs (only for setup_comm_tiled)
each lo/hi value may extend beyond 0 to N-1 into another periodic image
return # of overlaps including with self, caller handles self overlaps as needed
return list of overlaps
for setup_comm_tiled() this is what box_drop() computes
entire box for each overlap
caller will determine extent of overlap using PBC info
for setup_remap(), return extent of overlap (no PBC info involved)
use partition_grid() or partition_tiled() and MAX/MIN to determine this
------------------------------------------------------------------------- */
int Grid2d::compute_overlap(int ghostflag, int *box, int *pbc, Overlap *&overlap)
{
int obox[4];
memory->create(overlap_procs,nprocs,"grid2d:overlap_procs");
noverlap_list = maxoverlap_list = 0;
overlap_list = nullptr;
// skip overlap check if box contains no grid cells
if (box[0] > box[1] || box[2] > box[3]) {
overlap = overlap_list;
return noverlap_list;
}
// test obox against appropriate layout
if (comm->layout != Comm::LAYOUT_TILED) {
// find comm->procgrid indices in each dim for box bounds
int iproclo = proc_index_uniform(box[0],nx,shift_grid,0,xsplit);
int iprochi = proc_index_uniform(box[1],nx,shift_grid,0,xsplit);
int jproclo = proc_index_uniform(box[2],ny,shift_grid,1,ysplit);
int jprochi = proc_index_uniform(box[3],ny,shift_grid,1,ysplit);
// compute extent of overlap of box with each proc's obox
for (int j = jproclo; j <= jprochi; j++)
for (int i = iproclo; i <= iprochi; i++) {
partition_grid(nx,xsplit[i],xsplit[i+1],shift_grid,0,obox[0],obox[1]);
partition_grid(ny,ysplit[j],ysplit[j+1],shift_grid,yextra,obox[2],obox[3]);
if (noverlap_list == maxoverlap_list) grow_overlap();
overlap_list[noverlap_list].proc = grid2proc[i][j][0];
overlap_list[noverlap_list].box[0] = MAX(box[0],obox[0]);
overlap_list[noverlap_list].box[1] = MIN(box[1],obox[1]);
overlap_list[noverlap_list].box[2] = MAX(box[2],obox[2]);
overlap_list[noverlap_list].box[3] = MIN(box[3],obox[3]);
noverlap_list++;
}
} else {
box_drop(box,pbc);
// compute extent of overlap of box with each proc's obox
if (ghostflag == 0) {
for (int m = 0; m < noverlap_list; m++) {
obox[0] = 0;
obox[1] = nx-1;
obox[2] = 0;
obox[3] = ny-1;
partition_tiled(overlap_list[m].proc,0,nprocs-1,obox);
overlap_list[m].box[0] = MAX(box[0],obox[0]);
overlap_list[m].box[1] = MIN(box[1],obox[1]);
overlap_list[m].box[2] = MAX(box[2],obox[2]);
overlap_list[m].box[3] = MIN(box[3],obox[3]);
}
}
}
overlap = overlap_list;
return noverlap_list;
}
/* ----------------------------------------------------------------------
deallocate data created by recursive overlap computation
------------------------------------------------------------------------- */
void Grid2d::clean_overlap()
{
memory->destroy(overlap_procs);
memory->sfree(overlap_list);
}
/* ----------------------------------------------------------------------
recursively split a box until it doesn't overlap any periodic boundaries
box = 4 integers = (xlo,xhi,ylo,yhi)
each lo/hi value may extend beyond 0 to N-1 into another periodic image
pbc = flags in each dim of which periodic image the caller box was in
when a box straddles a periodic boundary, split it in two
when a box does not straddle, drop it down RCB tree
add all the procs it overlaps with to Overlap list
------------------------------------------------------------------------- */
void Grid2d::box_drop(int *box, int *pbc)
{
int i,m;
// newbox12 and newpbc are initially copies of caller box and pbc
int newbox1[4],newbox2[4],newpbc[2];
for (i = 0; i < 4; i++) newbox1[i] = newbox2[i] = box[i];
for (i = 0; i < 2; i++) newpbc[i] = pbc[i];
// 4 if tests to see if box needs to be split across a periodic boundary
// newbox1 and 2 = new split boxes, newpbc increments current pbc
// final else is no split
int splitflag = 1;
if (box[0] < 0) {
newbox1[0] = 0;
newbox2[0] = box[0] + nx;
newbox2[1] = nx - 1;
newpbc[0]--;
} else if (box[1] >= nx) {
newbox1[1] = nx - 1;
newbox2[0] = 0;
newbox2[1] = box[1] - nx;
newpbc[0]++;
} else if (box[2] < 0) {
newbox1[2] = 0;
newbox2[2] = box[2] + ny;
newbox2[3] = ny - 1;
newpbc[1]--;
} else if (box[3] >= ny) {
newbox1[3] = ny - 1;
newbox2[2] = 0;
newbox2[3] = box[3] - ny;
newpbc[1]++;
// box is not split, drop on RCB tree
// returns np = # of procs it overlaps, including self
// returns proc_overlap = list of proc IDs it overlaps
// add each overlap to overlap list
} else {
splitflag = 0;
int np = 0;
box_drop_grid(box,0,nprocs-1,np,overlap_procs);
for (m = 0; m < np; m++) {
if (noverlap_list == maxoverlap_list) grow_overlap();
overlap_list[noverlap_list].proc = overlap_procs[m];
for (i = 0; i < 4; i++) overlap_list[noverlap_list].box[i] = box[i];
for (i = 0; i < 2; i++) overlap_list[noverlap_list].pbc[i] = pbc[i];
noverlap_list++;
}
}
// recurse with 2 split boxes
if (splitflag) {
box_drop(newbox1,pbc);
box_drop(newbox2,newpbc);
}
}
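/* ----------------------------------------------------------------------
   worked example of the split above (not from the original source):
   with nx = 10 and a ghost box whose x range is (-2,3), the first branch
   fires: newbox1 covers x = 0..3 in this image with unchanged pbc, and
   newbox2 covers x = 8..9 with pbc[0] decremented, i.e. the portion that
   wrapped across the -x periodic boundary; both boxes are then re-dropped
------------------------------------------------------------------------- */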
/* ----------------------------------------------------------------------
recursively drop a box down the RCB tree to find all procs it overlaps with
box = 4 integers = (xlo,xhi,ylo,yhi)
each lo/hi value ranges from 0 to N-1 in a dim, N = grid size in that dim
box is guaranteed to be wholly within the global domain
return Np = # of procs, plist = proc IDs
------------------------------------------------------------------------- */
void Grid2d::box_drop_grid(int *box, int proclower, int procupper,
int &np, int *plist)
{
// end recursion when partition is a single proc
// add proclower to plist
if (proclower == procupper) {
plist[np++] = proclower;
return;
}
// drop box on each side of cut it extends beyond
// use < and >= criteria so does not include a box it only touches
// procmid = 1st processor in upper half of partition
// = location in tree that stores this cut
// cut = index of first grid cell in upper partition
// dim = 0,1 dimension of cut
int procmid = proclower + (procupper - proclower) / 2 + 1;
int dim = rcbinfo[procmid].dim;
int cut = rcbinfo[procmid].cut;
if (box[2*dim] < cut) box_drop_grid(box,proclower,procmid-1,np,plist);
if (box[2*dim+1] >= cut) box_drop_grid(box,procmid,procupper,np,plist);
}
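/* ----------------------------------------------------------------------
   traversal example (not from the original source): for a 4-proc RCB tree,
   the first call uses procmid = 2, whose stored dim/cut describe the root
   cut; a box lying entirely below that cut recurses only into procs 0-1,
   while a box straddling it recurses into both halves, so the returned
   plist holds exactly the procs whose owned sub-boxes the box overlaps
------------------------------------------------------------------------- */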
// ----------------------------------------------------------------------
// miscellaneous methods
// ----------------------------------------------------------------------
/* ----------------------------------------------------------------------
grow list of swaps by DELTA
------------------------------------------------------------------------- */
void Grid2d::grow_swap()
{
maxswap += DELTA;
swap = (Swap *) memory->srealloc(swap,maxswap*sizeof(Swap),"grid2d:swap");
}
/* ----------------------------------------------------------------------
grow list of overlaps by DELTA
------------------------------------------------------------------------- */
void Grid2d::grow_overlap()
{
maxoverlap_list += DELTA;
overlap_list = (Overlap *)
memory->srealloc(overlap_list,maxoverlap_list*sizeof(Overlap),"grid2d:overlap");
}
/* ----------------------------------------------------------------------
deallocate remap data structs
------------------------------------------------------------------------- */
void Grid2d::deallocate_remap()
{
for (int i = 0; i < nsend_remap; i++)
memory->destroy(send_remap[i].packlist);
delete [] send_remap;
for (int i = 0; i < nrecv_remap; i++)
memory->destroy(recv_remap[i].unpacklist);
delete [] recv_remap;
if (self_remap) {
memory->destroy(copy_remap.packlist);
memory->destroy(copy_remap.unpacklist);
}
}
/* ----------------------------------------------------------------------
create 1d list of offsets into 2d array section (xlo:xhi,ylo:yhi)
assume caller's 2d array is allocated as
(fullxlo:fullxhi,fullylo:fullyhi)
------------------------------------------------------------------------- */
int Grid2d::indices(int *&list, int xlo, int xhi, int ylo, int yhi)
{
int nmax = (xhi-xlo+1) * (yhi-ylo+1);
memory->create(list,nmax,"grid2d:indices");
if (nmax == 0) return 0;
int nx = (fullxhi-fullxlo+1);
int n = 0;
int ix,iy;
for (iy = ylo; iy <= yhi; iy++)
for (ix = xlo; ix <= xhi; ix++)
list[n++] = (iy-fullylo)*nx + (ix-fullxlo);
return nmax;
}
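/* ----------------------------------------------------------------------
   worked example (not from the original source): if the caller's array
   spans fullxlo..fullxhi = -1..10 and fullylo..fullyhi = -1..10, then
   nx = 12 and the owned cell (ix,iy) = (0,0) maps to offset
   (0-(-1))*12 + (0-(-1)) = 13 in the caller's flattened 2d array
------------------------------------------------------------------------- */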
/* ----------------------------------------------------------------------
find the comm->procgrid index = which proc owns the igrid index
igrid = grid index (0 to N-1) in dim
n = # of grid points in dim
shift determines position of grid pt within grid cell
shift = 0.5 for cell center, 0.0 for lower-left corner
dim = which dimension (0,1)
split = comm->x/y/z split for fractional bounds of each proc domain
------------------------------------------------------------------------- */
int Grid2d::proc_index_uniform(int igrid, int n, double shift, int dim, double *split)
{
int lo,hi;
double fraclo,frachi;
// loop over # of procs in this dim
// compute the grid bounds for that proc
// if igrid falls within those bounds, return m = proc index
// same logic as in partition_grid()
int m;
for (m = 0; m < comm->procgrid[dim]; m++) {
fraclo = split[m];
frachi = split[m+1];
lo = static_cast<int> (fraclo * n);
while (lo+shift < fraclo*n) lo++;
hi = static_cast<int> (frachi * n);
if (hi+shift >= frachi*n) hi--;
if (igrid >= lo && igrid <= hi) break;
}
return m;
}
/* ----------------------------------------------------------------------
compute the grid box owned by proc within tiled decomposition
performed recursively until proclower = procupper = proc
return box = lo/hi bounds of proc's box in 2 dims
------------------------------------------------------------------------- */
void Grid2d::partition_tiled(int proc, int proclower, int procupper, int *box)
{
// end recursion when partition is a single proc
if (proclower == procupper) return;
// split processor partition
// procmid = 1st processor in upper half of partition
// = location in tree that stores this cut
// cut = index of first grid cell in upper partition
// dim = 0,1 dimension of cut
int procmid = proclower + (procupper - proclower) / 2 + 1;
int dim = rcbinfo[procmid].dim;
int cut = rcbinfo[procmid].cut;
// adjust box to reflect which half of partition the proc is in
if (proc < procmid) {
box[2*dim+1] = cut-1;
partition_tiled(proc,proclower,procmid-1,box);
} else {
box[2*dim] = cut;
partition_tiled(proc,procmid,procupper,box);
}
}