commit 4455b66003
parent 3cc23720e9
Author: sjplimp
Date:   2014-07-28 19:58:51 +00:00

    git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12207 f3b2605a-c512-4ea7-a41b-209d697bcdaa

23 changed files with 2418 additions and 700 deletions

View File

@@ -211,6 +211,9 @@ void PPPMCuda::init()
   // error check

   if (domain->dimension == 2) error->all(FLERR,"Cannot use PPPMCuda with 2d simulation");
+  if (comm->style != 0)
+    error->universe_all(FLERR,"PPPMCuda can only currently be used with "
+                        "comm_style brick");
   if (!atom->q_flag) error->all(FLERR,"Kspace style requires atom attribute q");

View File

@ -88,6 +88,10 @@ FixLbFluid::FixLbFluid(LAMMPS *lmp, int narg, char **arg) :
if(narg <7) error->all(FLERR,"Illegal fix lb/fluid command"); if(narg <7) error->all(FLERR,"Illegal fix lb/fluid command");
if (comm->style != 0)
error->universe_all(FLERR,"Fix lb/fluid can only currently be used with "
"comm_style brick");
MPI_Comm_rank(world,&me); MPI_Comm_rank(world,&me);
MPI_Comm_size(world,&nprocs); MPI_Comm_size(world,&nprocs);
@ -567,9 +571,12 @@ int FixLbFluid::setmask()
void FixLbFluid::init(void) void FixLbFluid::init(void)
{ {
int i,j; int i,j;
if (comm->style != 0)
error->universe_all(FLERR,"Fix lb/fluid can only currently be used with "
"comm_style brick");
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
// Check to see if the MD timestep has changed between runs. // Check to see if the MD timestep has changed between runs.
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------

View File

@@ -47,6 +47,8 @@ using namespace MathConst;
 #define CUDA_CHUNK 3000
 #define MAXBODY 20       // max # of lines in one body, also in ReadData class

+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
+
 /* ---------------------------------------------------------------------- */

 Atom::Atom(LAMMPS *lmp) : Pointers(lmp)
@@ -753,17 +755,33 @@ void Atom::data_atoms(int n, char *buf)
     sublo[2] = domain->sublo_lamda[2]; subhi[2] = domain->subhi_lamda[2];
   }

-  if (domain->xperiodic) {
-    if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
-    if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
-  }
-  if (domain->yperiodic) {
-    if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
-    if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
-  }
-  if (domain->zperiodic) {
-    if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
-    if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
-  }
+  if (comm->layout != LAYOUT_TILED) {
+    if (domain->xperiodic) {
+      if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
+      if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
+    }
+    if (domain->yperiodic) {
+      if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
+      if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
+    }
+    if (domain->zperiodic) {
+      if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
+      if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
+    }
+  } else {
+    if (domain->xperiodic) {
+      if (comm->mysplit[0][0] == 0.0) sublo[0] -= epsilon[0];
+      if (comm->mysplit[0][1] == 1.0) subhi[0] += epsilon[0];
+    }
+    if (domain->yperiodic) {
+      if (comm->mysplit[1][0] == 0.0) sublo[1] -= epsilon[1];
+      if (comm->mysplit[1][1] == 1.0) subhi[1] += epsilon[1];
+    }
+    if (domain->zperiodic) {
+      if (comm->mysplit[2][0] == 0.0) sublo[2] -= epsilon[2];
+      if (comm->mysplit[2][1] == 1.0) subhi[2] += epsilon[2];
+    }
+  }

   // xptr = which word in line starts xyz coords
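
The new else-branch above keys off comm->mysplit rather than comm->myloc, because a tiled (RCB) decomposition has no logical 3d grid position. The following standalone sketch, using hypothetical box and split values (it is not LAMMPS source), shows the same test in isolation: only a tile whose fractional bound sits exactly at 0.0 or 1.0 touches a periodic box face and therefore gets the epsilon slack.

// Standalone illustration (not LAMMPS source): how fractional sub-domain
// bounds decide where the epsilon slack is applied for a tiled layout.
#include <cstdio>

int main() {
  // hypothetical box and one processor's RCB tile, in fractional coords
  double boxlo[3] = {0.0, 0.0, 0.0};
  double prd[3]   = {10.0, 10.0, 10.0};
  double mysplit[3][2] = {{0.0, 0.5}, {0.25, 0.75}, {0.5, 1.0}};
  double epsilon[3] = {1.0e-6, 1.0e-6, 1.0e-6};
  int periodic[3] = {1, 1, 1};

  for (int d = 0; d < 3; d++) {
    double sublo = boxlo[d] + mysplit[d][0]*prd[d];
    double subhi = boxlo[d] + mysplit[d][1]*prd[d];
    // only tiles touching a periodic box face get the slack,
    // mirroring the myloc==0 / myloc==procgrid-1 test of the brick layout
    if (periodic[d] && mysplit[d][0] == 0.0) sublo -= epsilon[d];
    if (periodic[d] && mysplit[d][1] == 1.0) subhi += epsilon[d];
    printf("dim %d: sub-domain [%g,%g]\n", d, sublo, subhi);
  }
  return 0;
}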

View File

@ -21,6 +21,7 @@
#include "balance.h" #include "balance.h"
#include "atom.h" #include "atom.h"
#include "comm.h" #include "comm.h"
#include "rcb.h"
#include "irregular.h" #include "irregular.h"
#include "domain.h" #include "domain.h"
#include "force.h" #include "force.h"
@ -30,9 +31,10 @@
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
enum{XYZ,SHIFT,RCB}; enum{XYZ,SHIFT,BISECTION};
enum{NONE,UNIFORM,USER}; enum{NONE,UNIFORM,USER};
enum{X,Y,Z}; enum{X,Y,Z};
enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED}; // several files
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
@ -47,6 +49,8 @@ Balance::Balance(LAMMPS *lmp) : Pointers(lmp)
user_xsplit = user_ysplit = user_zsplit = NULL; user_xsplit = user_ysplit = user_zsplit = NULL;
shift_allocate = 0; shift_allocate = 0;
rcb = NULL;
fp = NULL; fp = NULL;
firststep = 1; firststep = 1;
} }
@ -74,6 +78,8 @@ Balance::~Balance()
delete [] hisum; delete [] hisum;
} }
delete rcb;
if (fp) fclose(fp); if (fp) fclose(fp);
} }
@@ -176,7 +182,7 @@ void Balance::command(int narg, char **arg)
     } else if (strcmp(arg[iarg],"rcb") == 0) {
       if (style != -1) error->all(FLERR,"Illegal balance command");
-      style = RCB;
+      style = BISECTION;
       iarg++;
     } else break;
@@ -232,6 +238,9 @@
     }
   }

+  if (style == BISECTION && comm->style == 0)
+    error->all(FLERR,"Balance rcb cannot be used with comm_style brick");
+
   // insure atoms are in current box & update box via shrink-wrap
   // init entire system since comm->setup is done
   // comm::init needs neighbor::init needs pair::init needs kspace::init, etc
@@ -251,8 +260,10 @@ void Balance::command(int narg, char **arg)
   double imbinit = imbalance_nlocal(maxinit);

   // no load-balance if imbalance doesn't exceed threshhold
+  // unless switching from tiled to non tiled layout, then force rebalance

-  if (imbinit < thresh) return;
+  if (comm->layout == LAYOUT_TILED && style != BISECTION) {
+  } else if (imbinit < thresh) return;

   // debug output of initial state
@@ -262,80 +273,65 @@ void Balance::command(int narg, char **arg)
   int niter = 0;

-  // NOTE: if using XYZ or SHIFT and current partition is TILING,
-  //       then need to create initial BRICK partition before performing LB
-
   // perform load-balance
   // style XYZ = explicit setting of cutting planes of logical 3d grid

   if (style == XYZ) {
+    if (comm->layout == LAYOUT_UNIFORM) {
+      if (xflag == USER || yflag == USER || zflag == USER)
+        comm->layout == LAYOUT_NONUNIFORM;
+    } else if (comm->style == LAYOUT_NONUNIFORM) {
+      if (xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
+        comm->layout == LAYOUT_UNIFORM;
+    } else if (comm->style == LAYOUT_TILED) {
+      if (xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
+        comm->layout == LAYOUT_UNIFORM;
+      else comm->layout == LAYOUT_NONUNIFORM;
+    }
+
     if (xflag == UNIFORM) {
       for (int i = 0; i < procgrid[0]; i++)
         comm->xsplit[i] = i * 1.0/procgrid[0];
       comm->xsplit[procgrid[0]] = 1.0;
-    }
+    } else if (xflag == USER)
+      for (int i = 0; i <= procgrid[0]; i++) comm->xsplit[i] = user_xsplit[i];
     if (yflag == UNIFORM) {
       for (int i = 0; i < procgrid[1]; i++)
         comm->ysplit[i] = i * 1.0/procgrid[1];
       comm->ysplit[procgrid[1]] = 1.0;
-    }
+    } else if (yflag == USER)
+      for (int i = 0; i <= procgrid[1]; i++) comm->ysplit[i] = user_ysplit[i];
     if (zflag == UNIFORM) {
       for (int i = 0; i < procgrid[2]; i++)
         comm->zsplit[i] = i * 1.0/procgrid[2];
       comm->zsplit[procgrid[2]] = 1.0;
-    }
-
-    if (xflag == USER)
-      for (int i = 0; i <= procgrid[0]; i++) comm->xsplit[i] = user_xsplit[i];
-    if (yflag == USER)
-      for (int i = 0; i <= procgrid[1]; i++) comm->ysplit[i] = user_ysplit[i];
-    if (zflag == USER)
+    } else if (zflag == USER)
       for (int i = 0; i <= procgrid[2]; i++) comm->zsplit[i] = user_zsplit[i];
   }

   // style SHIFT = adjust cutting planes of logical 3d grid

   if (style == SHIFT) {
-    static_setup(bstr);
+    comm->layout = LAYOUT_NONUNIFORM;
+    shift_setup_static(bstr);
     niter = shift();
   }

-  // style RCB =
+  // style BISECTION = recursive coordinate bisectioning

-  if (style == RCB) {
-    error->all(FLERR,"Balance rcb is not yet supported");
-
-    if (comm->style == 0)
-      error->all(FLERR,"Cannot use balance rcb with comm_style brick");
+  if (style == BISECTION) {
+    comm->layout = LAYOUT_TILED;
+    bisection(1);
   }

   // output of final result

   if (outflag && me == 0) dumpout(update->ntimestep,fp);

-  // reset comm->uniform flag if necessary
-
-  if (comm->uniform) {
-    if (style == SHIFT) comm->uniform = 0;
-    if (style == XYZ && xflag == USER) comm->uniform = 0;
-    if (style == XYZ && yflag == USER) comm->uniform = 0;
-    if (style == XYZ && zflag == USER) comm->uniform = 0;
-  } else {
-    if (dimension == 3) {
-      if (style == XYZ &&
-          xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
-        comm->uniform = 1;
-    } else {
-      if (style == XYZ && xflag == UNIFORM && yflag == UNIFORM)
-        comm->uniform = 1;
-    }
-  }
-
   // reset proc sub-domains
+  // for either brick or tiled comm style

   if (domain->triclinic) domain->set_lamda_box();
   domain->set_local_box();
@@ -344,7 +340,8 @@ void Balance::command(int narg, char **arg)
   if (domain->triclinic) domain->x2lamda(atom->nlocal);
   Irregular *irregular = new Irregular(lmp);
-  irregular->migrate_atoms(1);
+  if (style == BISECTION) irregular->migrate_atoms(1,rcb->sendproc);
+  else irregular->migrate_atoms(1);
   delete irregular;
   if (domain->triclinic) domain->lamda2x(atom->nlocal);
@@ -382,34 +379,36 @@ void Balance::command(int narg, char **arg)
     }
   }

-  if (me == 0) {
-    if (screen) {
-      fprintf(screen," x cuts:");
-      for (int i = 0; i <= comm->procgrid[0]; i++)
-        fprintf(screen," %g",comm->xsplit[i]);
-      fprintf(screen,"\n");
-      fprintf(screen," y cuts:");
-      for (int i = 0; i <= comm->procgrid[1]; i++)
-        fprintf(screen," %g",comm->ysplit[i]);
-      fprintf(screen,"\n");
-      fprintf(screen," z cuts:");
-      for (int i = 0; i <= comm->procgrid[2]; i++)
-        fprintf(screen," %g",comm->zsplit[i]);
-      fprintf(screen,"\n");
-    }
-    if (logfile) {
-      fprintf(logfile," x cuts:");
-      for (int i = 0; i <= comm->procgrid[0]; i++)
-        fprintf(logfile," %g",comm->xsplit[i]);
-      fprintf(logfile,"\n");
-      fprintf(logfile," y cuts:");
-      for (int i = 0; i <= comm->procgrid[1]; i++)
-        fprintf(logfile," %g",comm->ysplit[i]);
-      fprintf(logfile,"\n");
-      fprintf(logfile," z cuts:");
-      for (int i = 0; i <= comm->procgrid[2]; i++)
-        fprintf(logfile," %g",comm->zsplit[i]);
-      fprintf(logfile,"\n");
-    }
-  }
+  if (style != BISECTION) {
+    if (me == 0) {
+      if (screen) {
+        fprintf(screen," x cuts:");
+        for (int i = 0; i <= comm->procgrid[0]; i++)
+          fprintf(screen," %g",comm->xsplit[i]);
+        fprintf(screen,"\n");
+        fprintf(screen," y cuts:");
+        for (int i = 0; i <= comm->procgrid[1]; i++)
+          fprintf(screen," %g",comm->ysplit[i]);
+        fprintf(screen,"\n");
+        fprintf(screen," z cuts:");
+        for (int i = 0; i <= comm->procgrid[2]; i++)
+          fprintf(screen," %g",comm->zsplit[i]);
+        fprintf(screen,"\n");
+      }
+      if (logfile) {
+        fprintf(logfile," x cuts:");
+        for (int i = 0; i <= comm->procgrid[0]; i++)
+          fprintf(logfile," %g",comm->xsplit[i]);
+        fprintf(logfile,"\n");
+        fprintf(logfile," y cuts:");
+        for (int i = 0; i <= comm->procgrid[1]; i++)
+          fprintf(logfile," %g",comm->ysplit[i]);
+        fprintf(logfile,"\n");
+        fprintf(logfile," z cuts:");
+        for (int i = 0; i <= comm->procgrid[2]; i++)
+          fprintf(logfile," %g",comm->zsplit[i]);
+        fprintf(logfile,"\n");
+      }
+    }
+  }
 }
@@ -467,13 +466,52 @@ double Balance::imbalance_splits(int &max)
   return imbalance;
 }

+/* ----------------------------------------------------------------------
+   perform balancing via RCB class
+   sortflag = flag for sorting order of received messages by proc ID
+------------------------------------------------------------------------- */
+
+int *Balance::bisection(int sortflag)
+{
+  if (!rcb) rcb = new RCB(lmp);
+
+  // NOTE: lo/hi args could be simulation box or particle bounding box
+  // NOTE: triclinic needs to be in lamda coords
+
+  int dim = domain->dimension;
+  double *boxlo = domain->boxlo;
+  double *boxhi = domain->boxhi;
+  double *prd = domain->prd;
+
+  rcb->compute(dim,atom->nlocal,atom->x,NULL,boxlo,boxhi);
+  rcb->invert(sortflag);
+
+  // NOTE: this logic is specific to orthogonal boxes, not triclinic
+
+  double (*mysplit)[2] = comm->mysplit;
+
+  mysplit[0][0] = (rcb->lo[0] - boxlo[0]) / prd[0];
+  if (rcb->hi[0] == boxhi[0]) mysplit[0][1] = 1.0;
+  else mysplit[0][1] = (rcb->hi[0] - boxlo[0]) / prd[0];
+
+  mysplit[1][0] = (rcb->lo[1] - boxlo[1]) / prd[1];
+  if (rcb->hi[1] == boxhi[1]) mysplit[1][1] = 1.0;
+  else mysplit[1][1] = (rcb->hi[1] - boxlo[1]) / prd[1];
+
+  mysplit[2][0] = (rcb->lo[2] - boxlo[2]) / prd[2];
+  if (rcb->hi[2] == boxhi[2]) mysplit[2][1] = 1.0;
+  else mysplit[2][1] = (rcb->hi[2] - boxlo[2]) / prd[2];
+
+  return rcb->sendproc;
+}
+
 /* ----------------------------------------------------------------------
    setup static load balance operations
-   called from command
+   called from command and indirectly initially from fix balance
    set rho = 0 for static balancing
 ------------------------------------------------------------------------- */

-void Balance::static_setup(char *str)
+void Balance::shift_setup_static(char *str)
 {
   shift_allocate = 1;
@@ -498,21 +536,35 @@ void Balance::static_setup(char *str)
     losum = new bigint[max+1];
     hisum = new bigint[max+1];

+  // if current layout is TILED, set initial uniform splits in Comm
+  // this gives starting point to subsequent shift balancing
+
+  if (comm->layout == LAYOUT_TILED) {
+    int *procgrid = comm->procgrid;
+    double *xsplit = comm->xsplit;
+    double *ysplit = comm->ysplit;
+    double *zsplit = comm->zsplit;
+    for (int i = 0; i < procgrid[0]; i++) xsplit[i] = i * 1.0/procgrid[0];
+    for (int i = 0; i < procgrid[1]; i++) ysplit[i] = i * 1.0/procgrid[1];
+    for (int i = 0; i < procgrid[2]; i++) zsplit[i] = i * 1.0/procgrid[2];
+    xsplit[procgrid[0]] = ysplit[procgrid[1]] = zsplit[procgrid[2]] = 1.0;
+  }
+
   rho = 0;
 }

 /* ----------------------------------------------------------------------
    setup shift load balance operations
    called from fix balance
-   set rho = 1 for shift balancing after call to shift_setup()
+   set rho = 1 to do dynamic balancing after call to shift_setup_static()
------------------------------------------------------------------------- */

 void Balance::shift_setup(char *str, int nitermax_in, double thresh_in)
 {
-  static_setup(str);
+  shift_setup_static(str);
   nitermax = nitermax_in;
   stopthresh = thresh_in;
   rho = 1;
 }
@@ -525,7 +577,7 @@ void Balance::shift_setup(char *str, int nitermax_in, double thresh_in)
 int Balance::shift()
 {
   int i,j,k,m,np,max;
-  double *split = NULL;
+  double *split;

   // no balancing if no atoms
@@ -590,7 +642,7 @@ int Balance::shift()
   // iterate until balanced

 #ifdef BALANCE_DEBUG
-  if (me == 0) debug_output(idim,0,np,split);
+  if (me == 0) debug_shift_output(idim,0,np,split);
 #endif

   int doneflag;
@@ -601,7 +653,7 @@ int Balance::shift()
     niter++;

 #ifdef BALANCE_DEBUG
-    if (me == 0) debug_output(idim,m+1,np,split);
+    if (me == 0) debug_shift_output(idim,m+1,np,split);
     if (me == 0 && fp) dumpout(update->ntimestep,fp);
 #endif
@@ -827,7 +879,7 @@ void Balance::dumpout(bigint tstep, FILE *bfp)
   int nx = comm->procgrid[0] + 1;
   int ny = comm->procgrid[1] + 1;
-  //int nz = comm->procgrid[2] + 1;
+  int nz = comm->procgrid[2] + 1;

   if (dimension == 2) {
     int m = 0;
@@ -914,7 +966,7 @@ void Balance::dumpout(bigint tstep, FILE *bfp)
 ------------------------------------------------------------------------- */

 #ifdef BALANCE_DEBUG
-void Balance::debug_output(int idim, int m, int np, double *split)
+void Balance::debug_shift_output(int idim, int m, int np, double *split)
 {
   int i;
   const char *dim = NULL;
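
For reference, the fractional-split conversion performed in the new Balance::bisection() can be exercised on its own. The sketch below uses hypothetical box and RCB bounds (it is not LAMMPS source) and mirrors the way the upper bound is pinned to exactly 1.0 when an RCB box touches boxhi, so that later equality tests are exact.

// Standalone sketch (not LAMMPS source) of mapping an RCB box [lo,hi]
// inside an orthogonal simulation box to fractional bounds in [0,1].
#include <cstdio>

int main() {
  double boxlo[3] = {0.0, 0.0, 0.0};        // hypothetical orthogonal box
  double boxhi[3] = {20.0, 10.0, 10.0};
  double prd[3]   = {20.0, 10.0, 10.0};
  double rcblo[3] = {5.0, 0.0, 2.5};        // hypothetical RCB result
  double rcbhi[3] = {10.0, 10.0, 10.0};
  double mysplit[3][2];

  for (int d = 0; d < 3; d++) {
    mysplit[d][0] = (rcblo[d] - boxlo[d]) / prd[d];
    // pin to 1.0 instead of computing (hi-lo)/prd, avoiding round-off
    if (rcbhi[d] == boxhi[d]) mysplit[d][1] = 1.0;
    else mysplit[d][1] = (rcbhi[d] - boxlo[d]) / prd[d];
    printf("dim %d: split = [%g,%g]\n", d, mysplit[d][0], mysplit[d][1]);
  }
  return 0;
}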

View File

@@ -27,11 +27,14 @@ namespace LAMMPS_NS {
 class Balance : protected Pointers {
  public:
+  class RCB *rcb;
+
   Balance(class LAMMPS *);
   ~Balance();
   void command(int, char **);
   void shift_setup(char *, int, double);
   int shift();
+  int *bisection(int sortflag = 0);
   double imbalance_nlocal(int &);
   void dumpout(bigint, FILE *);
@@ -66,13 +69,13 @@ class Balance : protected Pointers {
   FILE *fp;
   int firststep;

-  void static_setup(char *);
   double imbalance_splits(int &);
+  void shift_setup_static(char *);
   void tally(int, int, double *);
   int adjust(int, double *);
   int binary(double, int, double *);
 #ifdef BALANCE_DEBUG
-  void debug_output(int, int, int, double *);
+  void debug_shift_output(int, int, int, double *);
 #endif
 };

View File

@@ -47,25 +47,95 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
   gridflag = ONELEVEL;
   mapflag = CART;
   customfile = NULL;
+  outfile = NULL;
   recv_from_partition = send_to_partition = -1;
   otherflag = 0;
-  outfile = NULL;
+  maxexchange_atom = maxexchange_fix = 0;

+  grid2proc = NULL;
   xsplit = ysplit = zsplit = NULL;

+  // use of OpenMP threads
+  // query OpenMP for number of threads/process set by user at run-time
+  // if the OMP_NUM_THREADS environment variable is not set, we default
+  // to using 1 thread. This follows the principle of the least surprise,
+  // while practically all OpenMP implementations violate it by using
+  // as many threads as there are (virtual) CPU cores by default.
+
+  nthreads = 1;
+#ifdef _OPENMP
+  if (lmp->kokkos) {
+    nthreads = lmp->kokkos->num_threads * lmp->kokkos->numa;
+  } else if (getenv("OMP_NUM_THREADS") == NULL) {
+    nthreads = 1;
+    if (me == 0)
+      error->warning(FLERR,"OMP_NUM_THREADS environment is not set.");
+  } else {
+    nthreads = omp_get_max_threads();
+  }
+
+  // enforce consistent number of threads across all MPI tasks
+
+  MPI_Bcast(&nthreads,1,MPI_INT,0,world);
+
+  if (!lmp->kokkos) omp_set_num_threads(nthreads);
+
+  if (me == 0) {
+    if (screen)
+      fprintf(screen," using %d OpenMP thread(s) per MPI task\n",nthreads);
+    if (logfile)
+      fprintf(logfile," using %d OpenMP thread(s) per MPI task\n",nthreads);
+  }
+#endif
 }

 /* ---------------------------------------------------------------------- */

 Comm::~Comm()
 {
+  memory->destroy(grid2proc);
   memory->destroy(xsplit);
   memory->destroy(ysplit);
   memory->destroy(zsplit);

   delete [] customfile;
   delete [] outfile;
 }

+/* ----------------------------------------------------------------------
+   deep copy of arrays from old Comm class to new one
+   all public/protected vectors/arrays in parent Comm class must be copied
+   called from alternate constructor of child classes
+   when new comm style is created from Input
+------------------------------------------------------------------------- */
+
+void Comm::copy_arrays(Comm *oldcomm)
+{
+  if (oldcomm->grid2proc) {
+    memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
+                   "comm:grid2proc");
+    memcpy(&grid2proc[0][0][0],&oldcomm->grid2proc[0][0][0],
+           (procgrid[0]*procgrid[1]*procgrid[2])*sizeof(int));
+
+    memory->create(xsplit,procgrid[0]+1,"comm:xsplit");
+    memory->create(ysplit,procgrid[1]+1,"comm:ysplit");
+    memory->create(zsplit,procgrid[2]+1,"comm:zsplit");
+    memcpy(xsplit,oldcomm->xsplit,(procgrid[0]+1)*sizeof(double));
+    memcpy(ysplit,oldcomm->ysplit,(procgrid[1]+1)*sizeof(double));
+    memcpy(zsplit,oldcomm->zsplit,(procgrid[2]+1)*sizeof(double));
+  }
+
+  if (customfile) {
+    int n = strlen(oldcomm->customfile) + 1;
+    customfile = new char[n];
+    strcpy(customfile,oldcomm->customfile);
+  }
+
+  if (outfile) {
+    int n = strlen(oldcomm->outfile) + 1;
+    outfile = new char[n];
+    strcpy(outfile,oldcomm->outfile);
+  }
+}
+
 /* ----------------------------------------------------------------------
    modify communication params
    invoked from input script by comm_modify command
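
The new copy_arrays() exists so that a comm_style change mid-script can hand the old object's per-grid arrays to the newly constructed comm object before the old one is destroyed. Below is a minimal sketch of that hand-off pattern with simplified, hypothetical classes; it is not the real Comm/CommBrick API, just the ownership idea under those assumptions.

// Simplified sketch (hypothetical classes, not the LAMMPS API): a new child
// object is built from the old one and deep-copies the parent-owned arrays
// so the old object can be deleted safely afterwards.
#include <cstring>
#include <cstdio>

struct CommBase {
  int procgrid[3];
  double *xsplit;                      // fractional cuts, procgrid[0]+1 values

  CommBase() : xsplit(nullptr) { procgrid[0] = procgrid[1] = procgrid[2] = 1; }
  virtual ~CommBase() { delete [] xsplit; }

  // deep copy of parent-owned arrays from the old object into this one
  void copy_arrays(const CommBase *oldcomm) {
    if (oldcomm->xsplit) {
      xsplit = new double[procgrid[0]+1];
      memcpy(xsplit, oldcomm->xsplit, (procgrid[0]+1)*sizeof(double));
    }
  }
};

struct BrickComm : CommBase {
  BrickComm() {
    procgrid[0] = 4;
    xsplit = new double[5]{0.0, 0.25, 0.5, 0.75, 1.0};
  }
  // alternate constructor: take over state from whatever style ran before
  explicit BrickComm(const CommBase *oldcomm) {
    procgrid[0] = oldcomm->procgrid[0];
    copy_arrays(oldcomm);              // this object now owns its own copy
  }
};

int main() {
  CommBase *oldcomm = new BrickComm();
  CommBase *newcomm = new BrickComm(oldcomm);   // style switch
  delete oldcomm;                               // safe: arrays were copied
  printf("first cut after switch: %g\n", newcomm->xsplit[1]);
  delete newcomm;
  return 0;
}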

View File

@@ -21,21 +21,15 @@ namespace LAMMPS_NS {
 class Comm : protected Pointers {
  public:
   int style;     // comm pattern: 0 = 6-way stencil, 1 = irregular tiling
-  int layout;    // current proc domains: 0 = logical bricks, 1 = general tiling
-                 // can do style=1 on layout=0, but not vice versa
-  // NOTE: uniform needs to be subsumed into layout
-  int uniform;   // 1 = equal subdomains, 0 = load-balanced
+  int layout;    // LAYOUT_UNIFORM = logical equal-sized bricks
+                 // LAYOUT_NONUNIFORM = logical bricks,
+                 //   but different sizes due to LB
+                 // LAYOUT_TILED = general tiling, due to RCB LB

   int me,nprocs;                    // proc info
-  int procgrid[3];                  // procs assigned in each dim of 3d grid
-  int user_procgrid[3];             // user request for procs in each dim
-  int myloc[3];                     // which proc I am in each dim
-  int procneigh[3][2];              // my 6 neighboring procs, 0/1 = left/right
   int ghost_velocity;               // 1 if ghost atoms have velocity, 0 if not
-  double *xsplit,*ysplit,*zsplit;   // fractional (0-1) sub-domain sizes
   double cutghost[3];               // cutoffs used for acquiring ghost atoms
   double cutghostuser;              // user-specified ghost cutoff
-  int ***grid2proc;                 // which proc owns i,j,k loc in 3d grid
   int recv_from_partition;          // recv proc layout from this partition
   int send_to_partition;            // send my proc layout to this partition
                                     // -1 if no recv or send
@@ -45,8 +39,24 @@ class Comm : protected Pointers {
   int maxexchange_fix;              // max contribution to exchange from Fixes
   int nthreads;                     // OpenMP threads per MPI process

+  // public settings specific to layout = UNIFORM, NONUNIFORM
+
+  int procgrid[3];                  // procs assigned in each dim of 3d grid
+  int user_procgrid[3];             // user request for procs in each dim
+  int myloc[3];                     // which proc I am in each dim
+  int procneigh[3][2];              // my 6 neighboring procs, 0/1 = left/right
+  double *xsplit,*ysplit,*zsplit;   // fractional (0-1) sub-domain sizes
+  int ***grid2proc;                 // which proc owns i,j,k loc in 3d grid
+
+  // public settings specific to layout = TILED
+
+  double mysplit[3][2];             // fractional (0-1) bounds of my sub-domain
+
+  // methods
+
   Comm(class LAMMPS *);
   virtual ~Comm();
+  void copy_arrays(class Comm *);
   void modify_params(int, char **);
   void set_processors(int, char **);  // set 3d processor grid attributes
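
The two layout descriptions declared above overlap: for a brick layout a proc's fractional bounds are implied by myloc and the split arrays, while a tiled layout stores the same two numbers per dimension directly in mysplit. A small sketch with hypothetical values (not LAMMPS source) showing the correspondence:

// Sketch: reading a brick proc's fractional bounds off the split arrays,
// which is exactly what mysplit[3][2] stores directly for a tiled layout.
#include <cstdio>

int main() {
  // hypothetical 4x2x1 grid, this proc at (2,1,0), uniform cuts
  int procgrid[3] = {4, 2, 1};
  int myloc[3]    = {2, 1, 0};
  double xsplit[5] = {0.0, 0.25, 0.5, 0.75, 1.0};
  double ysplit[3] = {0.0, 0.5, 1.0};
  double zsplit[2] = {0.0, 1.0};
  const double *splits[3] = {xsplit, ysplit, zsplit};

  double mysplit[3][2];
  for (int d = 0; d < 3; d++) {
    mysplit[d][0] = splits[d][myloc[d]];      // lower fractional bound
    mysplit[d][1] = splits[d][myloc[d]+1];    // upper fractional bound
    printf("dim %d: [%g,%g]\n", d, mysplit[d][0], mysplit[d][1]);
  }
  return 0;
}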

View File

@ -22,6 +22,7 @@
#include "stdio.h" #include "stdio.h"
#include "stdlib.h" #include "stdlib.h"
#include "comm_brick.h" #include "comm_brick.h"
#include "comm_tiled.h"
#include "universe.h" #include "universe.h"
#include "atom.h" #include "atom.h"
#include "atom_vec.h" #include "atom_vec.h"
@@ -52,58 +53,57 @@ using namespace LAMMPS_NS;
 #define BIG 1.0e20

 enum{SINGLE,MULTI}; // same as in Comm
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED}; // several files

 /* ---------------------------------------------------------------------- */

 CommBrick::CommBrick(LAMMPS *lmp) : Comm(lmp)
 {
-  style = layout = 0;
+  style = 0;
+  layout = LAYOUT_UNIFORM;
+  init_buffers();
+}

-  recv_from_partition = send_to_partition = -1;
-  otherflag = 0;
+/* ---------------------------------------------------------------------- */

-  grid2proc = NULL;
+CommBrick::~CommBrick()
+{
+  free_swap();
+  if (mode == MULTI) {
+    free_multi();
+    memory->destroy(cutghostmulti);
+  }

-  uniform = 1;
+  if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
+  memory->sfree(sendlist);
+  memory->destroy(maxsendlist);
+  memory->destroy(buf_send);
+  memory->destroy(buf_recv);
+}
+
+/* ---------------------------------------------------------------------- */
+
+CommBrick::CommBrick(LAMMPS *lmp, Comm *oldcomm) : Comm(*oldcomm)
+{
+  if (oldcomm->layout == LAYOUT_TILED)
+    error->all(FLERR,"Cannot change to comm_style brick from tiled layout");
+
+  style = 0;
+  layout = oldcomm->layout;
+  copy_arrays(oldcomm);
+  init_buffers();
+}
+
+/* ----------------------------------------------------------------------
+   initialize comm buffers and other data structs local to CommBrick
+------------------------------------------------------------------------- */
+
+void CommBrick::init_buffers()
+{
   multilo = multihi = NULL;
   cutghostmulti = NULL;

-  // use of OpenMP threads
-  // query OpenMP for number of threads/process set by user at run-time
-  // if the OMP_NUM_THREADS environment variable is not set, we default
-  // to using 1 thread. This follows the principle of the least surprise,
-  // while practically all OpenMP implementations violate it by using
-  // as many threads as there are (virtual) CPU cores by default.
-
-  nthreads = 1;
-#ifdef _OPENMP
-  if (lmp->kokkos) {
-    nthreads = lmp->kokkos->num_threads * lmp->kokkos->numa;
-  } else if (getenv("OMP_NUM_THREADS") == NULL) {
-    nthreads = 1;
-    if (me == 0)
-      error->warning(FLERR,"OMP_NUM_THREADS environment is not set.");
-  } else {
-    nthreads = omp_get_max_threads();
-  }
-
-  // enforce consistent number of threads across all MPI tasks
-
-  MPI_Bcast(&nthreads,1,MPI_INT,0,world);
-
-  if (!lmp->kokkos) omp_set_num_threads(nthreads);
-
-  if (me == 0) {
-    if (screen)
-      fprintf(screen," using %d OpenMP thread(s) per MPI task\n",nthreads);
-    if (logfile)
-      fprintf(logfile," using %d OpenMP thread(s) per MPI task\n",nthreads);
-  }
-#endif
-
-  // initialize comm buffers & exchange memory
-  // NOTE: allow for AtomVec to set maxexchange_atom, e.g. for atom_style body
-
-  maxexchange_atom = maxexchange_fix = 0;
   maxexchange = maxexchange_atom + maxexchange_fix;
   bufextra = maxexchange + BUFEXTRA;
@@ -125,26 +125,6 @@ CommBrick::CommBrick(LAMMPS *lmp) : Comm(lmp)
 /* ---------------------------------------------------------------------- */

-CommBrick::~CommBrick()
-{
-  memory->destroy(grid2proc);
-
-  free_swap();
-  if (mode == MULTI) {
-    free_multi();
-    memory->destroy(cutghostmulti);
-  }
-
-  if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
-  memory->sfree(sendlist);
-  memory->destroy(maxsendlist);
-  memory->destroy(buf_send);
-  memory->destroy(buf_recv);
-}
-
-/* ---------------------------------------------------------------------- */
-
 void CommBrick::init()
 {
   triclinic = domain->triclinic;
@@ -280,12 +260,12 @@ void CommBrick::setup()
   //   0 = to left, 1 = to right
   //   set equal to recvneed[idim][1/0] of neighbor proc
   // maxneed[idim] = max procs away any proc recvs atoms in either direction
-  // uniform = 1 = uniform sized sub-domains:
+  // layout = UNIFORM = uniform sized sub-domains:
   //   maxneed is directly computable from sub-domain size
   //   limit to procgrid-1 for non-PBC
   //   recvneed = maxneed except for procs near non-PBC
   //   sendneed = recvneed of neighbor on each side
-  // uniform = 0 = non-uniform sized sub-domains:
+  // layout = NONUNIFORM = non-uniform sized sub-domains:
   //   compute recvneed via updown() which accounts for non-PBC
   //   sendneed = recvneed of neighbor on each side
   //   maxneed via Allreduce() of recvneed
@@ -293,7 +273,7 @@ void CommBrick::setup()
   int *periodicity = domain->periodicity;
   int left,right;

-  if (uniform) {
+  if (layout == LAYOUT_UNIFORM) {
     maxneed[0] = static_cast<int> (cutghost[0] * procgrid[0] / prd[0]) + 1;
     maxneed[1] = static_cast<int> (cutghost[1] * procgrid[1] / prd[1]) + 1;
     maxneed[2] = static_cast<int> (cutghost[2] * procgrid[2] / prd[2]) + 1;
@@ -561,16 +541,15 @@ void CommBrick::forward_comm(int dummy)
     } else {
       if (comm_x_only) {
         if (sendnum[iswap])
-          n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                              x[firstrecv[iswap]],pbc_flag[iswap],
-                              pbc[iswap]);
+          avec->pack_comm(sendnum[iswap],sendlist[iswap],
+                          x[firstrecv[iswap]],pbc_flag[iswap],pbc[iswap]);
       } else if (ghost_velocity) {
-        n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
-                                buf_send,pbc_flag[iswap],pbc[iswap]);
+        avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
+                            buf_send,pbc_flag[iswap],pbc[iswap]);
         avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send);
       } else {
-        n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                            buf_send,pbc_flag[iswap],pbc[iswap]);
+        avec->pack_comm(sendnum[iswap],sendlist[iswap],
+                        buf_send,pbc_flag[iswap],pbc[iswap]);
         avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
       }
     }
@@ -620,10 +599,10 @@ void CommBrick::reverse_comm()
     } else {
       if (comm_f_only) {
         if (sendnum[iswap])
           avec->unpack_reverse(sendnum[iswap],sendlist[iswap],
                                f[firstrecv[iswap]]);
       } else {
-        n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
+        avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
         avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_send);
       }
     }
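
The LAYOUT_UNIFORM branch of CommBrick::setup() above computes how many neighboring procs ghosts can come from directly from the cutoff. A quick numeric check with hypothetical numbers (not LAMMPS source):

// Numeric check of the uniform-layout formula: maxneed follows directly
// from the ghost cutoff when all sub-domains have equal size.
#include <cstdio>

int main() {
  double prd      = 40.0;   // box length in this dimension (hypothetical)
  int    procgrid = 8;      // procs in this dimension -> sub-domain = 5.0
  double cutghost = 12.0;   // ghost cutoff spans more than two sub-domains

  // how many procs away ghost atoms can come from in this dimension
  int maxneed = static_cast<int>(cutghost * procgrid / prd) + 1;
  printf("maxneed = %d\n", maxneed);   // 12.0*8/40 = 2.4 -> 2, +1 -> 3
  return 0;
}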

View File

@@ -21,6 +21,7 @@ namespace LAMMPS_NS {
 class CommBrick : public Comm {
  public:
   CommBrick(class LAMMPS *);
+  CommBrick(class LAMMPS *, class Comm *);
   virtual ~CommBrick();
   virtual void init();
@@ -80,6 +81,7 @@ class CommBrick : public Comm {
   int maxexchange;        // max # of datums/atom in exchange comm
   int bufextra;           // extra space beyond maxsend in send buffer

+  void init_buffers();
   int updown(int, int, int, double, int, double *);
                           // compare cutoff to procs
   virtual void grow_send(int, int);       // reallocate send buffer

View File

@@ -11,15 +11,20 @@
    See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

+#include "string.h"
 #include "lmptype.h"
 #include "comm_tiled.h"
+#include "comm_brick.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "domain.h"
+#include "force.h"
 #include "pair.h"
+#include "neighbor.h"
 #include "modify.h"
 #include "fix.h"
 #include "compute.h"
+#include "output.h"
 #include "dump.h"
 #include "memory.h"
 #include "error.h"
@@ -27,32 +32,93 @@
 using namespace LAMMPS_NS;

 #define BUFFACTOR 1.5
-#define BUFFACTOR 1.5
+#define BUFMIN 1000
+#define BUFEXTRA 1000

 enum{SINGLE,MULTI}; // same as in Comm
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED}; // several files

 /* ---------------------------------------------------------------------- */

 CommTiled::CommTiled(LAMMPS *lmp) : Comm(lmp)
 {
-  style = 1;
-  layout = 0;
   error->all(FLERR,"Comm_style tiled is not yet supported");
+
+  style = 1;
+  layout = LAYOUT_UNIFORM;
+  init_buffers();
+}
+
+/* ---------------------------------------------------------------------- */
+
+CommTiled::CommTiled(LAMMPS *lmp, Comm *oldcomm) : Comm(*oldcomm)
+{
+  style = 1;
+  layout = oldcomm->layout;
+  copy_arrays(oldcomm);
+  init_buffers();
 }

 /* ---------------------------------------------------------------------- */

 CommTiled::~CommTiled()
 {
+  free_swap();
+  if (sendlist) for (int i = 0; i < nswap; i++) memory->destroy(sendlist[i]);
+  memory->sfree(sendlist);
+  memory->destroy(maxsendlist);
+  memory->destroy(buf_send);
+  memory->destroy(buf_recv);
 }

-/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+   initialize comm buffers and other data structs local to CommTiled
+   NOTE: if this is identical to CommBrick, put it into Comm ??
+------------------------------------------------------------------------- */
+
+void CommTiled::init_buffers()
+{
+  maxexchange = maxexchange_atom + maxexchange_fix;
+  bufextra = maxexchange + BUFEXTRA;
+
+  maxsend = BUFMIN;
+  memory->create(buf_send,maxsend+bufextra,"comm:buf_send");
+  maxrecv = BUFMIN;
+  memory->create(buf_recv,maxrecv,"comm:buf_recv");
+
+  nswap = 2 * domain->dimension;
+  allocate_swap(nswap);
+
+  //sendlist = (int **) memory->smalloc(nswap*sizeof(int *),"comm:sendlist");
+  //memory->create(maxsendlist,nswap,"comm:maxsendlist");
+  //for (int i = 0; i < nswap; i++) {
+  //  maxsendlist[i] = BUFMIN;
+  //  memory->create(sendlist[i],BUFMIN,"comm:sendlist[i]");
+  //}
+}
+
+/* ----------------------------------------------------------------------
+   NOTE: if this is nearly identical to CommBrick, put it into Comm ??
+------------------------------------------------------------------------- */

 void CommTiled::init()
 {
   triclinic = domain->triclinic;
   map_style = atom->map_style;

+  // temporary restrictions
+
+  if (triclinic)
+    error->all(FLERR,"Cannot yet use comm_style tiled with triclinic box");
+  if (domain->xperiodic || domain->yperiodic ||
+      (domain->dimension == 2 && domain->zperiodic))
+    error->all(FLERR,"Cannot yet use comm_style tiled with periodic box");
+  if (mode == MULTI)
+    error->all(FLERR,"Cannot yet use comm_style tiled with multi-mode comm");
+
   // comm_only = 1 if only x,f are exchanged in forward/reverse comm
   // comm_x_only = 0 if ghost_velocity since velocities are added
@@ -72,51 +138,210 @@ void CommTiled::init()
   for (int i = 0; i < modify->nfix; i++)
     size_border += modify->fix[i]->comm_border;

+  // maxexchange = max # of datums/atom in exchange communication
+  // maxforward = # of datums in largest forward communication
+  // maxreverse = # of datums in largest reverse communication
+  // query pair,fix,compute,dump for their requirements
+  // pair style can force reverse comm even if newton off
+
+  maxexchange = BUFMIN + maxexchange_fix;
+  maxforward = MAX(size_forward,size_border);
+  maxreverse = size_reverse;
+
+  if (force->pair) maxforward = MAX(maxforward,force->pair->comm_forward);
+  if (force->pair) maxreverse = MAX(maxreverse,force->pair->comm_reverse);
+
+  for (int i = 0; i < modify->nfix; i++) {
+    maxforward = MAX(maxforward,modify->fix[i]->comm_forward);
+    maxreverse = MAX(maxreverse,modify->fix[i]->comm_reverse);
+  }
+
+  for (int i = 0; i < modify->ncompute; i++) {
+    maxforward = MAX(maxforward,modify->compute[i]->comm_forward);
+    maxreverse = MAX(maxreverse,modify->compute[i]->comm_reverse);
+  }
+
+  for (int i = 0; i < output->ndump; i++) {
+    maxforward = MAX(maxforward,output->dump[i]->comm_forward);
+    maxreverse = MAX(maxreverse,output->dump[i]->comm_reverse);
+  }
+
+  if (force->newton == 0) maxreverse = 0;
+  if (force->pair) maxreverse = MAX(maxreverse,force->pair->comm_reverse_off);
 }

 /* ----------------------------------------------------------------------
    setup spatial-decomposition communication patterns
-   function of neighbor cutoff(s) & cutghostuser & current box size
+   function of neighbor cutoff(s) & cutghostuser & current box size and tiling
+   single mode sets slab boundaries (slablo,slabhi) based on max cutoff
+   multi mode sets type-dependent slab boundaries (multilo,multihi)
------------------------------------------------------------------------- */

 void CommTiled::setup()
 {
-  // error on triclinic or multi?
-  // set nswap = 2*dim
-  // setup neighbor proc info for exchange()
-  // setup nsendproc and nrecvproc bounts
-  // setup sendproc and recvproc lists
-  // setup sendboxes
-  // reallocate requests and statuses
-  // check that cutoff is <= 1/2 of periodic box len?
-  // loop over dims
-  // left:
-  //   construct ghost boxes
-  //   differnet in x,y,z
-  //   account for ghost borders in y,z
-  //   account for PBC by shifting
-  //   split into multiple boxes if straddles PBC
-  //   drop boxes down RCB tree
-  //   count unique procs they cover
-  //   what about self if crosses PBC
-  //   for each proc they cover:
-  //     compute box I send it to left
-  //     is a message I will recv from right (don't care about box)
-  //   for ghost-extended boxes
-  //     do not count procs that do not overlap my owned box at all
-  //     only touching edge of my owned box does not count
-  //     in this case list I send to and recv from may be different?
-  // same thing to right
+  int i;
+
+  int dimension;
+  int *periodicity;
+  double *prd,*sublo,*subhi,*boxlo,*boxhi;
+
+  double cut = MAX(neighbor->cutneighmax,cutghostuser);
+
+  dimension = domain->dimension;
+  periodicity = domain->periodicity;
+  prd = domain->prd;
+  sublo = domain->sublo;
+  subhi = domain->subhi;
+  boxlo = domain->boxlo;
+  boxhi = domain->boxhi;
+  cutghost[0] = cutghost[1] = cutghost[2] = cut;
+
+  if ((periodicity[0] && cut > prd[0]) ||
+      (periodicity[1] && cut > prd[1]) ||
+      (dimension == 3 && periodicity[2] && cut > prd[2]))
+    error->all(FLERR,"Communication cutoff for comm_style tiled "
+               "cannot exceed periodic box length");
+
+  // allocate overlap
+
+  int *overlap;
+  int noverlap,noverlap1,indexme;
+  double lo1[3],hi1[3],lo2[3],hi2[3];
+  int one,two;
+
+  nswap = 0;
+  for (int idim = 0; idim < dimension; idim++) {
+
+    // ghost box in lower direction
+
+    one = 1;
+    lo1[0] = sublo[0]; lo1[1] = sublo[1]; lo1[2] = sublo[2];
+    hi1[0] = subhi[0]; hi1[1] = subhi[1]; hi1[2] = subhi[2];
+    lo1[idim] = sublo[idim] - cut;
+    hi1[idim] = sublo[idim];
+
+    two = 0;
+    if (periodicity[idim] && lo1[idim] < boxlo[idim]) {
+      two = 1;
+      lo2[0] = sublo[0]; lo2[1] = sublo[1]; lo2[2] = sublo[2];
+      hi2[0] = subhi[0]; hi2[1] = subhi[1]; hi2[2] = subhi[2];
+      lo2[idim] = lo1[idim] + prd[idim];
+      hi2[idim] = hi1[idim] + prd[idim];
+      if (sublo[idim] == boxlo[idim]) {
+        one = 0;
+        hi2[idim] = boxhi[idim];
+      }
+    }
+
+    indexme = -1;
+    noverlap = 0;
+    if (one) {
+      if (layout == LAYOUT_UNIFORM)
+        box_drop_uniform(idim,lo1,hi1,noverlap,overlap,indexme);
+      else if (layout == LAYOUT_NONUNIFORM)
+        box_drop_nonuniform(idim,lo1,hi1,noverlap,overlap,indexme);
+      else
+        box_drop_tiled(lo1,hi1,0,nprocs-1,noverlap,overlap,indexme);
+    }
+    noverlap1 = noverlap;
+    if (two) {
+      if (layout == LAYOUT_UNIFORM)
+        box_drop_uniform(idim,lo2,hi2,noverlap,overlap,indexme);
+      else if (layout == LAYOUT_NONUNIFORM)
+        box_drop_nonuniform(idim,lo2,hi2,noverlap,overlap,indexme);
+      else
+        box_drop_tiled(lo2,hi2,0,nprocs-1,noverlap,overlap,indexme);
+    }
+
+    // if this (self) proc is in overlap list, move it to end of list
+
+    if (indexme >= 0) {
+      int tmp = overlap[noverlap-1];
+      overlap[noverlap-1] = overlap[indexme];
+      overlap[indexme] = tmp;
+    }
+
+    // overlap how has list of noverlap procs
+    // includes PBC effects
+
+    if (overlap[noverlap-1] == me) sendself[nswap] = 1;
+    else sendself[nswap] = 0;
+    if (noverlap-sendself[nswap]) sendother[nswap] = 1;
+    else sendother[nswap] = 0;
+
+    nsendproc[nswap] = noverlap;
+    for (i = 0; i < noverlap; i++) sendproc[nswap][i] = overlap[i];
+    nrecvproc[nswap+1] = noverlap;
+    for (i = 0; i < noverlap; i++) recvproc[nswap+1][i] = overlap[i];
+
+    // compute sendbox for each of my sends
+    // ibox = intersection of ghostbox with other proc's sub-domain
+    // sendbox = ibox displaced by cutoff in dim
+    // NOTE: need to extend send box in lower dims by cutoff
+    // NOTE: this logic for overlapping boxes is not correct for sending
+
+    double oboxlo[3],oboxhi[3],sbox[6];
+
+    for (i = 0; i < noverlap; i++) {
+      pbc_flag[nswap][i] = 0;
+      pbc[nswap][i][0] = pbc[nswap][i][1] = pbc[nswap][i][2] =
+        pbc[nswap][i][3] = pbc[nswap][i][4] = pbc[nswap][i][5] = 0;
+
+      if (layout == LAYOUT_UNIFORM)
+        box_other_uniform(overlap[i],oboxlo,oboxhi);
+      else if (layout == LAYOUT_NONUNIFORM)
+        box_other_nonuniform(overlap[i],oboxlo,oboxhi);
+      else
+        box_other_tiled(overlap[i],oboxlo,oboxhi);
+
+      if (i < noverlap1) {
+        sbox[0] = MAX(oboxlo[0],lo1[0]);
+        sbox[1] = MAX(oboxlo[1],lo1[1]);
+        sbox[2] = MAX(oboxlo[2],lo1[2]);
+        sbox[3] = MIN(oboxhi[0],hi1[0]);
+        sbox[4] = MIN(oboxhi[1],hi1[1]);
+        sbox[5] = MIN(oboxhi[2],hi1[2]);
+        sbox[idim] += cut;
+        sbox[3+idim] += cut;
+        if (sbox[idim] == lo1[idim]) sbox[idim] = sublo[idim];
+      } else {
+        pbc_flag[nswap][i] = 1;
+        pbc[nswap][i][idim] = 1;
+        sbox[0] = MAX(oboxlo[0],lo2[0]);
+        sbox[1] = MAX(oboxlo[1],lo2[1]);
+        sbox[2] = MAX(oboxlo[2],lo2[2]);
+        sbox[3] = MIN(oboxhi[0],hi2[0]);
+        sbox[4] = MIN(oboxhi[1],hi2[1]);
+        sbox[5] = MIN(oboxhi[2],hi2[2]);
+        sbox[idim] -= prd[idim] - cut;
+        sbox[3+idim] -= prd[idim] + cut;
+        if (sbox[idim] == lo1[idim]) sbox[idim] = sublo[idim];
+      }
+
+      if (idim >= 1) {
+        if (sbox[0] == sublo[0]) sbox[0] -= cut;
+        if (sbox[4] == subhi[0]) sbox[4] += cut;
+      }
+      if (idim == 2) {
+        if (sbox[1] == sublo[1]) sbox[1] -= cut;
+        if (sbox[5] == subhi[1]) sbox[5] += cut;
+      }
+
+      memcpy(sendbox[nswap][i],sbox,6*sizeof(double));
+    }
+
+    // ghost box in upper direction
+
+    nswap += 2;
+  }
+
+  // reallocate requests and statuses to max of any swap
+
+  // what need from decomp (RCB):
+  // dropbox: return list of procs with overlap and overlapping boxes
+  //   return n, proclist, boxlist
+  // otherbox: bbox of another proc
+  // dropatom: return what proc owns the atom coord
 }

 /* ----------------------------------------------------------------------
@@ -126,46 +351,73 @@ void CommTiled::setup()
 void CommTiled::forward_comm(int dummy)
 {
-  int i,irecv,n;
+  int i,irecv,n,nsend,nrecv;
   MPI_Status status;
   AtomVec *avec = atom->avec;
   double **x = atom->x;

   // exchange data with another set of procs in each swap
-  // if first proc in set is self, then is just self across PBC, just copy
+  // post recvs from all procs except self
+  // send data to all procs except self
+  // copy data to self if sendself is set
+  // wait on all procs except self and unpack received data
   // if comm_x_only set, exchange or copy directly to x, don't unpack

   for (int iswap = 0; iswap < nswap; iswap++) {
-    if (sendproc[iswap][0] != me) {
-      if (comm_x_only) {
-        for (i = 0; i < nrecvproc[iswap]; i++)
-          MPI_Irecv(x[firstrecv[iswap][i]],size_forward_recv[iswap][i],
-                    MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
-        for (i = 0; i < nsendproc[iswap]; i++) {
-          n = avec->pack_comm(sendnum[iswap][i],sendlist[iswap][i],
-                              buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
-          MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap][i],0,world);
-        }
-        MPI_Waitall(nrecvproc[iswap],requests,statuses);
-      } else if (ghost_velocity) {
-        for (i = 0; i < nrecvproc[iswap]; i++)
-          MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
-                    size_forward_recv[iswap][i],
-                    MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
-        for (i = 0; i < nsendproc[iswap]; i++) {
-          n = avec->pack_comm_vel(sendnum[iswap][i],sendlist[iswap][i],
-                                  buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
-          MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap][i],0,world);
-        }
-        for (i = 0; i < nrecvproc[iswap]; i++) {
-          MPI_Waitany(nrecvproc[iswap],requests,&irecv,&status);
-          avec->unpack_comm_vel(recvnum[iswap][i],firstrecv[iswap][irecv],
-                                &buf_recv[forward_recv_offset[iswap][irecv]]);
-        }
-      } else {
-        for (i = 0; i < nrecvproc[iswap]; i++)
+    nsend = nsendproc[iswap] - sendself[iswap];
+    nrecv = nrecvproc[iswap] - sendself[iswap];
+
+    if (comm_x_only) {
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++)
+          MPI_Irecv(x[firstrecv[iswap][i]],size_forward_recv[iswap][i],
+                    MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
+        for (i = 0; i < nsend; i++) {
+          n = avec->pack_comm(sendnum[iswap][i],sendlist[iswap][i],
+                              buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
+          MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap][i],0,world);
+        }
+      }
+      if (sendself[iswap]) {
+        avec->pack_comm(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                        x[firstrecv[iswap][nrecv]],pbc_flag[iswap][nsend],
+                        pbc[iswap][nsend]);
+      }
+      if (sendother[iswap]) MPI_Waitall(nrecv,requests,statuses);
+
+    } else if (ghost_velocity) {
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++)
+          MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
+                    size_forward_recv[iswap][i],
+                    MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
+        for (i = 0; i < nsend; i++) {
+          n = avec->pack_comm_vel(sendnum[iswap][i],sendlist[iswap][i],
+                                  buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
+          MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap][i],0,world);
+        }
+      }
+      if (sendself[iswap]) {
+        avec->pack_comm_vel(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                            buf_send,pbc_flag[iswap][nsend],pbc[iswap][nsend]);
+        avec->unpack_comm_vel(recvnum[iswap][nrecv],firstrecv[iswap][nrecv],
+                              buf_send);
+      }
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++) {
+          MPI_Waitany(nrecv,requests,&irecv,&status);
+          avec->unpack_comm_vel(recvnum[iswap][irecv],firstrecv[iswap][irecv],
+                                &buf_recv[forward_recv_offset[iswap][irecv]]);
+        }
+      }
+
+    } else {
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++)
           MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
                     size_forward_recv[iswap][i],
                     MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
@@ -174,27 +426,21 @@ void CommTiled::forward_comm(int dummy)
                           buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
           MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap][i],0,world);
         }
-        for (i = 0; i < nrecvproc[iswap]; i++) {
-          MPI_Waitany(nrecvproc[iswap],requests,&irecv,&status);
-          avec->unpack_comm(recvnum[iswap][i],firstrecv[iswap][irecv],
-                            &buf_recv[forward_recv_offset[iswap][irecv]]);
-        }
       }
-    } else {
-      if (comm_x_only) {
-        if (sendnum[iswap][0])
-          n = avec->pack_comm(sendnum[iswap][0],sendlist[iswap][0],
-                              x[firstrecv[iswap][0]],pbc_flag[iswap][0],
-                              pbc[iswap][0]);
-      } else if (ghost_velocity) {
-        n = avec->pack_comm_vel(sendnum[iswap][0],sendlist[iswap][0],
-                                buf_send,pbc_flag[iswap][0],pbc[iswap][0]);
-        avec->unpack_comm_vel(recvnum[iswap][0],firstrecv[iswap][0],buf_send);
-      } else {
-        n = avec->pack_comm(sendnum[iswap][0],sendlist[iswap][0],
-                            buf_send,pbc_flag[iswap][0],pbc[iswap][0]);
-        avec->unpack_comm(recvnum[iswap][0],firstrecv[iswap][0],buf_send);
+
+      if (sendself[iswap]) {
+        avec->pack_comm(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                        buf_send,pbc_flag[iswap][nsend],pbc[iswap][nsend]);
+        avec->unpack_comm(recvnum[iswap][nrecv],firstrecv[iswap][nrecv],
+                          buf_send);
+      }
+
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++) {
+          MPI_Waitany(nrecv,requests,&irecv,&status);
+          avec->unpack_comm(recvnum[iswap][irecv],firstrecv[iswap][irecv],
+                            &buf_recv[forward_recv_offset[iswap][irecv]]);
+        }
       }
     }
   }
 }
@@ -207,57 +453,70 @@ void CommTiled::forward_comm(int dummy)
 void CommTiled::reverse_comm()
 {
-  int i,irecv,n;
+  int i,irecv,n,nsend,nrecv;
   MPI_Request request;
   MPI_Status status;
   AtomVec *avec = atom->avec;
   double **f = atom->f;

   // exchange data with another set of procs in each swap
-  // if first proc in set is self, then is just self across PBC, just copy
+  // post recvs from all procs except self
+  // send data to all procs except self
+  // copy data to self if sendself is set
+  // wait on all procs except self and unpack received data
   // if comm_f_only set, exchange or copy directly from f, don't pack

   for (int iswap = nswap-1; iswap >= 0; iswap--) {
-    if (sendproc[iswap][0] != me) {
-      if (comm_f_only) {
-        for (i = 0; i < nsendproc[iswap]; i++)
-          MPI_Irecv(&buf_recv[reverse_recv_offset[iswap][i]],
-                    size_reverse_recv[iswap][i],MPI_DOUBLE,
-                    sendproc[iswap][i],0,world,&requests[i]);
-        for (i = 0; i < nrecvproc[iswap]; i++)
-          MPI_Send(f[firstrecv[iswap][i]],size_reverse_send[iswap][i],
-                   MPI_DOUBLE,recvproc[iswap][i],0,world);
-        for (i = 0; i < nsendproc[iswap]; i++) {
-          MPI_Waitany(nsendproc[iswap],requests,&irecv,&status);
-          avec->unpack_reverse(sendnum[iswap][irecv],sendlist[iswap][irecv],
-                               &buf_recv[reverse_recv_offset[iswap][irecv]]);
-        }
-      } else {
-        for (i = 0; i < nsendproc[iswap]; i++)
-          MPI_Irecv(&buf_recv[reverse_recv_offset[iswap][i]],
-                    size_reverse_recv[iswap][i],MPI_DOUBLE,
-                    sendproc[iswap][i],0,world,&requests[i]);
-        for (i = 0; i < nrecvproc[iswap]; i++) {
-          n = avec->pack_reverse(recvnum[iswap][i],firstrecv[iswap][i],
-                                 buf_send);
-          MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap][i],0,world);
-        }
-        for (i = 0; i < nsendproc[iswap]; i++) {
-          MPI_Waitany(nsendproc[iswap],requests,&irecv,&status);
-          avec->unpack_reverse(sendnum[iswap][irecv],sendlist[iswap][irecv],
-                               &buf_recv[reverse_recv_offset[iswap][irecv]]);
-        }
-      }
-    } else {
-      if (comm_f_only) {
-        if (sendnum[iswap][0])
-          avec->unpack_reverse(sendnum[iswap][0],sendlist[iswap][0],
-                               f[firstrecv[iswap][0]]);
-      } else {
-        n = avec->pack_reverse(recvnum[iswap][0],firstrecv[iswap][0],buf_send);
-        avec->unpack_reverse(sendnum[iswap][0],sendlist[iswap][0],buf_send);
-      }
-    }
+    if (comm_f_only) {
+      if (sendother[iswap]) {
+        for (i = 0; i < nsend; i++)
+          MPI_Irecv(&buf_recv[reverse_recv_offset[iswap][i]],
+                    size_reverse_recv[iswap][i],MPI_DOUBLE,
+                    sendproc[iswap][i],0,world,&requests[i]);
+        for (i = 0; i < nrecv; i++)
+          MPI_Send(f[firstrecv[iswap][i]],size_reverse_send[iswap][i],
+                   MPI_DOUBLE,recvproc[iswap][i],0,world);
+      }
+
+      if (sendself[iswap]) {
+        avec->unpack_reverse(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                             f[firstrecv[iswap][nrecv]]);
+      }
+
+      if (sendother[iswap]) {
+        for (i = 0; i < nsend; i++) {
+          MPI_Waitany(nsend,requests,&irecv,&status);
+          avec->unpack_reverse(sendnum[iswap][irecv],sendlist[iswap][irecv],
+                               &buf_recv[reverse_recv_offset[iswap][irecv]]);
+        }
+      }
+
+    } else {
+      if (sendother[iswap]) {
+        for (i = 0; i < nsend; i++)
+          MPI_Irecv(&buf_recv[reverse_recv_offset[iswap][i]],
+                    size_reverse_recv[iswap][i],MPI_DOUBLE,
+                    sendproc[iswap][i],0,world,&requests[i]);
+        for (i = 0; i < nrecv; i++) {
+          n = avec->pack_reverse(recvnum[iswap][i],firstrecv[iswap][i],
+                                 buf_send);
+          MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap][i],0,world);
+        }
+      }
+
+      if (sendself[iswap]) {
+        avec->pack_reverse(recvnum[iswap][nrecv],firstrecv[iswap][nrecv],
+                           buf_send);
+        avec->unpack_reverse(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                             buf_send);
+      }
+
+      if (sendother[iswap]) {
+        for (i = 0; i < nsend; i++) {
+          MPI_Waitany(nsend,requests,&irecv,&status);
+          avec->unpack_reverse(sendnum[iswap][irecv],sendlist[iswap][irecv],
+                               &buf_recv[reverse_recv_offset[iswap][irecv]]);
+        }
+      }
+    }
   }
 }
@@ -298,7 +557,7 @@ void CommTiled::exchange()
 void CommTiled::borders()
 {
-  int i,n,irecv,ngroup,nlast,nsend,rmaxswap;
+  int i,n,irecv,ngroup,nlast,nsend,nrecv,ncount,rmaxswap;
   double xlo,xhi,ylo,yhi,zlo,zhi;
   double *bbox;
   double **x;
@@ -333,36 +592,40 @@ void CommTiled::borders()
     if (iswap < 2) nlast = atom->nlocal;
     else nlast = atom->nlocal + atom->nghost;

-    nsend = 0;
+    ncount = 0;
     for (i = 0; i < ngroup; i++)
       if (x[i][0] >= xlo && x[i][0] <= xhi &&
           x[i][1] >= ylo && x[i][1] <= yhi &&
           x[i][2] >= zlo && x[i][2] <= zhi) {
-        if (nsend == maxsendlist[iswap][i]) grow_list(iswap,i,nsend);
-        sendlist[iswap][i][nsend++] = i;
+        if (ncount == maxsendlist[iswap][i]) grow_list(iswap,i,ncount);
+        sendlist[iswap][i][ncount++] = i;
       }
     for (i = atom->nlocal; i < nlast; i++)
       if (x[i][0] >= xlo && x[i][0] <= xhi &&
           x[i][1] >= ylo && x[i][1] <= yhi &&
           x[i][2] >= zlo && x[i][2] <= zhi) {
-        if (nsend == maxsendlist[iswap][i]) grow_list(iswap,i,nsend);
-        sendlist[iswap][i][nsend++] = i;
+        if (ncount == maxsendlist[iswap][i]) grow_list(iswap,i,ncount);
+        sendlist[iswap][i][ncount++] = i;
       }
-    sendnum[iswap][i] = nsend;
-    smax = MAX(smax,nsend);
+    sendnum[iswap][i] = ncount;
+    smax = MAX(smax,ncount);
   }

-    // send sendnum counts to procs who recv from me
+    // send sendnum counts to procs who recv from me except self
+    // copy data to self if sendself is set

-    if (sendproc[iswap][0] != me) {
-      for (i = 0; i < nrecvproc[iswap]; i++)
+    nsend = nsendproc[iswap] - sendself[iswap];
+    nrecv = nrecvproc[iswap] - sendself[iswap];
+
+    if (sendother[iswap]) {
+      for (i = 0; i < nrecv; i++)
         MPI_Irecv(&recvnum[iswap][i],1,MPI_INT,
                   recvproc[iswap][i],0,world,&requests[i]);
-      for (i = 0; i < nsendproc[iswap]; i++)
+      for (i = 0; i < nsend; i++)
         MPI_Send(&sendnum[iswap][i],1,MPI_INT,sendproc[iswap][i],0,world);
-      MPI_Waitall(nrecvproc[iswap],requests,statuses);
-    } else recvnum[iswap][0] = sendnum[iswap][0];
+    }
+    if (sendself[iswap]) recvnum[iswap][nrecv] = sendnum[iswap][nsend];
+    if (sendother[iswap]) MPI_Waitall(nrecv,requests,statuses);

     // setup other per swap/proc values from sendnum and recvnum
@ -390,54 +653,64 @@ void CommTiled::borders()
    // swap atoms with other procs using pack_border(), unpack_border()

    if (ghost_velocity) {
      if (sendother[iswap]) {
        for (i = 0; i < nrecv; i++)
          MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
                    recvnum[iswap][i]*size_border,
                    MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
        for (i = 0; i < nsend; i++) {
          n = avec->pack_border_vel(sendnum[iswap][i],sendlist[iswap][i],
                                    buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
          MPI_Send(buf_send,n*size_border,MPI_DOUBLE,
                   sendproc[iswap][i],0,world);
        }
      }
      if (sendself[iswap]) {
        n = avec->pack_border_vel(sendnum[iswap][nsend],sendlist[iswap][nsend],
                                  buf_send,pbc_flag[iswap][nsend],
                                  pbc[iswap][nsend]);
        avec->unpack_border_vel(recvnum[iswap][nrecv],firstrecv[iswap][nrecv],
                                buf_send);
      }
      if (sendother[iswap]) {
        for (i = 0; i < nrecv; i++) {
          MPI_Waitany(nrecv,requests,&irecv,&status);
          avec->unpack_border(recvnum[iswap][irecv],firstrecv[iswap][irecv],
                              &buf_recv[forward_recv_offset[iswap][irecv]]);
        }
      }

    } else {
      if (sendother[iswap]) {
        for (i = 0; i < nrecv; i++)
          MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
                    recvnum[iswap][i]*size_border,
                    MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
        for (i = 0; i < nsend; i++) {
          n = avec->pack_border(sendnum[iswap][i],sendlist[iswap][i],
                                buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
          MPI_Send(buf_send,n*size_border,MPI_DOUBLE,
                   sendproc[iswap][i],0,world);
        }
      }
      if (sendself[iswap]) {
        n = avec->pack_border(sendnum[iswap][nsend],sendlist[iswap][nsend],
                              buf_send,pbc_flag[iswap][nsend],
                              pbc[iswap][nsend]);
        avec->unpack_border(recvnum[iswap][nsend],firstrecv[iswap][nsend],
                            buf_send);
      }
      if (sendother[iswap]) {
        for (i = 0; i < nrecv; i++) {
          MPI_Waitany(nrecv,requests,&irecv,&status);
          avec->unpack_border(recvnum[iswap][irecv],firstrecv[iswap][irecv],
                              &buf_recv[forward_recv_offset[iswap][irecv]]);
        }
      }
    }
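All of the new border and reverse loops above share one communication shape: post MPI_Irecv for every non-self partner, send to every non-self partner, service the self partner with a plain pack/unpack copy, and then drain the pending receives with MPI_Waitany in whatever order they complete. The sketch below is illustrative only (it is not LAMMPS code; the buffer and variable names are made up) and shows that same shape in a self-contained MPI program.

// sketch_sendself.cpp -- illustrative only, not part of LAMMPS
// build: mpicxx sketch_sendself.cpp -o sketch_sendself ; run: mpirun -np 4 ./sketch_sendself
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  int me,nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);

  // each rank exchanges one int with every rank, including itself:
  // non-self partners use Irecv/Send/Waitany, the self partner is a copy

  int nother = nprocs - 1;
  std::vector<int> sendval(nprocs),recvval(nprocs);
  std::vector<MPI_Request> requests(nother);
  std::vector<int> partner(nother);
  for (int p = 0; p < nprocs; p++) sendval[p] = 100*me + p;

  int k = 0;
  for (int p = 0; p < nprocs; p++)      // "sendother" part: post all recvs first
    if (p != me) {
      partner[k] = p;
      MPI_Irecv(&recvval[p],1,MPI_INT,p,0,MPI_COMM_WORLD,&requests[k]);
      k++;
    }
  for (int p = 0; p < nprocs; p++)      // then send to every non-self partner
    if (p != me) MPI_Send(&sendval[p],1,MPI_INT,p,0,MPI_COMM_WORLD);

  recvval[me] = sendval[me];            // "sendself" part: plain copy, no MPI call

  for (int j = 0; j < nother; j++) {    // unpack messages in arrival order
    int idx;
    MPI_Waitany(nother,requests.data(),&idx,MPI_STATUS_IGNORE);
    // here the real code would unpack buf_recv for partner[idx]
  }

  printf("rank %d done, value from self = %d\n",me,recvval[me]);
  MPI_Finalize();
  return 0;
}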
@ -786,6 +1059,64 @@ int CommTiled::exchange_variable(int n, double *inbuf, double *&outbuf)
  return nrecv;
}
/* ----------------------------------------------------------------------
determine overlap list of Noverlap procs the lo/hi box overlaps
overlap = non-zero area in common between box and proc sub-domain
   box is owned by me and extends in dim
------------------------------------------------------------------------- */
void CommTiled::box_drop_uniform(int dim, double *lo, double *hi,
int &noverlap, int *overlap, int &indexme)
{
}
/* ----------------------------------------------------------------------
determine overlap list of Noverlap procs the lo/hi box overlaps
overlap = non-zero area in common between box and proc sub-domain
------------------------------------------------------------------------- */
void CommTiled::box_drop_nonuniform(int dim, double *lo, double *hi,
int &noverlap, int *overlap, int &indexme)
{
}
/* ----------------------------------------------------------------------
determine overlap list of Noverlap procs the lo/hi box overlaps
overlap = non-zero area in common between box and proc sub-domain
recursive routine for traversing an RCB tree of cuts
------------------------------------------------------------------------- */
void CommTiled::box_drop_tiled(double *lo, double *hi,
int proclower, int procupper,
int &noverlap, int *overlap, int &indexme)
{
// end recursion when partition is a single proc
// add proc to overlap list
if (proclower == procupper) {
if (proclower == me) indexme = noverlap;
overlap[noverlap++] = proclower;
return;
}
// drop box on each side of cut it extends beyond
// use > and < criteria to not include procs it only touches
// procmid = 1st processor in upper half of partition
// = location in tree that stores this cut
// dim = 0,1,2 dimension of cut
// cut = position of cut
int procmid = proclower + (procupper - proclower) / 2 + 1;
double cut = tree[procmid].cut;
int dim = tree[procmid].dim;
if (lo[dim] < cut)
box_drop_tiled(lo,hi,proclower,procmid-1,noverlap,overlap,indexme);
if (hi[dim] > cut)
box_drop_tiled(lo,hi,procmid,procupper,noverlap,overlap,indexme);
}
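The recursion above walks an RCB cut tree: each interior node splits a processor range at one cut, and a box is dropped into whichever half (or both) it extends past, using strict inequalities so a box that only touches a cut is not duplicated. The following standalone sketch, which assumes a small hard-coded one-dimensional tree rather than the real RCB data, reproduces the same descent for illustration.

// sketch_box_drop.cpp -- illustrative only; mirrors the recursion above
// on a hard-coded 1d cut tree for 4 procs owning [0,0.25),[0.25,0.5),[0.5,0.75),[0.75,1)
#include <cstdio>

struct Tree { double cut; int dim; };

// procs [proclower,procupper] own the partition; tree[procmid] stores the cut
// that splits it into a lower and an upper half
void box_drop(const Tree *tree, double lo, double hi,
              int proclower, int procupper, int *overlap, int &noverlap)
{
  if (proclower == procupper) {            // single proc left: it overlaps the box
    overlap[noverlap++] = proclower;
    return;
  }
  int procmid = proclower + (procupper-proclower)/2 + 1;
  double cut = tree[procmid].cut;
  if (lo < cut) box_drop(tree,lo,hi,proclower,procmid-1,overlap,noverlap);
  if (hi > cut) box_drop(tree,lo,hi,procmid,procupper,overlap,noverlap);
}

int main()
{
  Tree tree[4] = {{0.0,0},{0.25,0},{0.5,0},{0.75,0}};
  int overlap[4],noverlap = 0;
  box_drop(tree,0.4,0.6,0,3,overlap,noverlap);   // box straddles the 0.5 cut
  for (int i = 0; i < noverlap; i++)
    printf("box overlaps proc %d\n",overlap[i]); // expected: procs 1 and 2
  return 0;
}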
/* ----------------------------------------------------------------------
   realloc the size of the send buffer as needed with BUFFACTOR and bufextra
   if flag = 1, realloc
@ -825,6 +1156,42 @@ void CommTiled::grow_list(int iswap, int iwhich, int n)
                     "comm:sendlist[iswap]");
}
/* ----------------------------------------------------------------------
allocation of swap info
------------------------------------------------------------------------- */
void CommTiled::allocate_swap(int n)
{
memory->create(sendnum,n,"comm:sendnum");
memory->create(recvnum,n,"comm:recvnum");
memory->create(sendproc,n,"comm:sendproc");
memory->create(recvproc,n,"comm:recvproc");
memory->create(size_forward_recv,n,"comm:size");
memory->create(size_reverse_send,n,"comm:size");
memory->create(size_reverse_recv,n,"comm:size");
memory->create(firstrecv,n,"comm:firstrecv");
memory->create(pbc_flag,n,"comm:pbc_flag");
memory->create(pbc,n,6,"comm:pbc");
}
/* ----------------------------------------------------------------------
free memory for swaps
------------------------------------------------------------------------- */
void CommTiled::free_swap()
{
memory->destroy(sendnum);
memory->destroy(recvnum);
memory->destroy(sendproc);
memory->destroy(recvproc);
memory->destroy(size_forward_recv);
memory->destroy(size_reverse_send);
memory->destroy(size_reverse_recv);
memory->destroy(firstrecv);
memory->destroy(pbc_flag);
memory->destroy(pbc);
}
/* ----------------------------------------------------------------------
   return # of bytes of allocated memory
------------------------------------------------------------------------- */
@ -21,6 +21,7 @@ namespace LAMMPS_NS {
class CommTiled : public Comm {
 public:
  CommTiled(class LAMMPS *);
  CommTiled(class LAMMPS *, class Comm *);
  virtual ~CommTiled();
  void init();
@ -46,15 +47,16 @@ class CommTiled : public Comm {
  bigint memory_usage();

 private:
  int triclinic;                // 0 if domain is orthog, 1 if triclinic
  int map_style;                // non-0 if global->local mapping is done
  int size_forward;             // # of per-atom datums in forward comm
  int size_reverse;             // # of datums in reverse comm
  int size_border;              // # of datums in forward border comm

  int nswap;                    // # of swaps to perform = 2*dim
  int *nsendproc,*nrecvproc;    // # of procs to send/recv to/from in each swap
  int *sendother;               // 1 if send to any other proc in each swap
  int *sendself;                // 1 if send to self in each swap
  int **sendnum,**recvnum;      // # of atoms to send/recv per swap/proc
  int **sendproc,**recvproc;    // proc to send/recv to/from per swap/proc
  int **size_forward_recv;      // # of values to recv in each forward swap/proc
@ -77,6 +79,7 @@ class CommTiled : public Comm {
  int maxsend,maxrecv;          // current size of send/recv buffer
  int maxforward,maxreverse;    // max # of datums in forward/reverse comm
  int maxexchange;              // max # of datums/atom in exchange comm
  int bufextra;                 // extra space beyond maxsend in send buffer

  MPI_Request *requests;
@ -84,9 +87,35 @@ class CommTiled : public Comm {
  int comm_x_only,comm_f_only;  // 1 if only exchange x,f in for/rev comm
struct Tree {
double cut;
int dim;
};
Tree *tree;
// info from RCB decomp
double rcbcut;
int rcbcutdim;
double rcblo[3];
double rcbhi[3];
void init_buffers();
void box_drop_uniform(int, double *, double *, int &, int *, int &);
void box_drop_nonuniform(int, double *, double *, int &, int *, int &);
void box_drop_tiled(double *, double *, int, int, int &, int *, int &);
void box_other_uniform(int, double *, double *) {}
void box_other_nonuniform(int, double *, double *) {}
void box_other_tiled(int, double *, double *) {}
  void grow_send(int, int);         // reallocate send buffer
  void grow_recv(int);              // free/allocate recv buffer
  void grow_list(int, int, int);    // reallocate sendlist for one swap/proc
  void allocate_swap(int);          // allocate swap arrays
  void free_swap();                 // free swap arrays
};

}
@ -44,14 +44,15 @@
using namespace LAMMPS_NS;
using namespace MathConst;

enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files

#define BIG 1.0e20
#define SMALL 1.0e-4
#define DELTAREGION 4
#define BONDSTRETCH 1.1

enum{NO_REMAP,X_REMAP,V_REMAP};    // same as fix_deform.cpp
enum{IGNORE,WARN,ERROR};           // same as thermo.cpp

/* ----------------------------------------------------------------------
   default is periodic
------------------------------------------------------------------------- */
@ -249,42 +250,56 @@ void Domain::set_global_box()
/* ----------------------------------------------------------------------
   set lamda box params
   assumes global box is defined and proc assignment has been made
   uses comm->xyz_split or comm->mysplit
     to define subbox boundaries in consistent manner
------------------------------------------------------------------------- */

void Domain::set_lamda_box()
{
  if (comm->layout != LAYOUT_TILED) {
    int *myloc = comm->myloc;
    double *xsplit = comm->xsplit;
    double *ysplit = comm->ysplit;
    double *zsplit = comm->zsplit;

    sublo_lamda[0] = xsplit[myloc[0]];
    subhi_lamda[0] = xsplit[myloc[0]+1];
    sublo_lamda[1] = ysplit[myloc[1]];
    subhi_lamda[1] = ysplit[myloc[1]+1];
    sublo_lamda[2] = zsplit[myloc[2]];
    subhi_lamda[2] = zsplit[myloc[2]+1];

  } else {
    double (*mysplit)[2] = comm->mysplit;

    sublo_lamda[0] = mysplit[0][0];
    subhi_lamda[0] = mysplit[0][1];
    sublo_lamda[1] = mysplit[1][0];
    subhi_lamda[1] = mysplit[1][1];
    sublo_lamda[2] = mysplit[2][0];
    subhi_lamda[2] = mysplit[2][1];
  }
}
/* ----------------------------------------------------------------------
   set local subbox params for orthogonal boxes
   assumes global box is defined and proc assignment has been made
   uses comm->xyz_split or comm->mysplit
     to define subbox boundaries in consistent manner
   insure subhi[max] = boxhi
------------------------------------------------------------------------- */

void Domain::set_local_box()
{
  if (triclinic) return;

  if (comm->layout != LAYOUT_TILED) {
    int *myloc = comm->myloc;
    int *procgrid = comm->procgrid;
    double *xsplit = comm->xsplit;
    double *ysplit = comm->ysplit;
    double *zsplit = comm->zsplit;

    sublo[0] = boxlo[0] + xprd*xsplit[myloc[0]];
    if (myloc[0] < procgrid[0]-1)
      subhi[0] = boxlo[0] + xprd*xsplit[myloc[0]+1];
    else subhi[0] = boxhi[0];
@ -299,6 +314,21 @@ void Domain::set_local_box()
    if (myloc[2] < procgrid[2]-1)
      subhi[2] = boxlo[2] + zprd*zsplit[myloc[2]+1];
    else subhi[2] = boxhi[2];
} else {
double (*mysplit)[2] = comm->mysplit;
sublo[0] = boxlo[0] + xprd*mysplit[0][0];
subhi[0] = boxlo[0] + xprd*mysplit[0][1];
if (mysplit[0][1] == 1.0) subhi[0] = boxhi[0];
sublo[1] = boxlo[1] + yprd*mysplit[1][0];
subhi[1] = boxlo[1] + yprd*mysplit[1][1];
if (mysplit[1][1] == 1.0) subhi[1] = boxhi[1];
sublo[2] = boxlo[2] + zprd*mysplit[2][0];
subhi[2] = boxlo[2] + zprd*mysplit[2][1];
if (mysplit[2][1] == 1.0) subhi[2] = boxhi[2];
  }
}
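For the tiled layout, mysplit[dim][0..1] are fractions of the global box, so a proc's sub-domain is simply boxlo + prd*fraction, with the upper edge snapped to boxhi when the fraction is exactly 1.0. A minimal sketch of that arithmetic, with illustrative values that are not taken from the commit:

// sketch_mysplit.cpp -- illustrative only: fractional splits -> subdomain bounds
#include <cstdio>

int main()
{
  double boxlo[3] = {0.0, 0.0, 0.0};
  double prd[3]   = {20.0, 10.0, 10.0};                        // box lengths
  double mysplit[3][2] = {{0.25,0.50},{0.0,1.0},{0.0,1.0}};    // this proc's tile

  double sublo[3],subhi[3];
  for (int d = 0; d < 3; d++) {
    sublo[d] = boxlo[d] + prd[d]*mysplit[d][0];
    subhi[d] = boxlo[d] + prd[d]*mysplit[d][1];
    // pin the upper edge exactly to boxhi to avoid round-off gaps,
    // like the hi-edge checks in set_local_box() above
    if (mysplit[d][1] == 1.0) subhi[d] = boxlo[d] + prd[d];
  }
  for (int d = 0; d < 3; d++)
    printf("dim %d: sub-domain [%g,%g]\n",d,sublo[d],subhi[d]);
  return 0;
}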
@ -22,12 +22,14 @@
#include "irregular.h"
#include "force.h"
#include "kspace.h"
#include "rcb.h"
#include "error.h"

using namespace LAMMPS_NS;
using namespace FixConst;

enum{SHIFT,BISECTION};
enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files

/* ---------------------------------------------------------------------- */

@ -53,7 +55,7 @@ FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
  thresh = force->numeric(FLERR,arg[4]);

  if (strcmp(arg[5],"shift") == 0) lbstyle = SHIFT;
  else if (strcmp(arg[5],"rcb") == 0) lbstyle = BISECTION;
  else error->all(FLERR,"Illegal fix balance command");

  int iarg = 5;
@ -65,7 +67,7 @@ FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
      stopthresh = force->numeric(FLERR,arg[iarg+3]);
      if (stopthresh < 1.0) error->all(FLERR,"Illegal fix balance command");
      iarg += 4;
    } else if (lbstyle == BISECTION) {
      iarg++;
    }
@ -97,14 +99,16 @@ FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
    }
  }

  if (lbstyle == BISECTION && comm->style == 0)
    error->all(FLERR,"Fix balance rcb cannot be used with comm_style brick");

  // create instance of Irregular class
  // create instance of Balance class
  // if SHIFT, initialize it with params

  balance = new Balance(lmp);
  if (lbstyle == SHIFT) balance->shift_setup(bstr,nitermax,thresh);

  irregular = new Irregular(lmp);
@ -238,32 +242,33 @@ void FixBalance::rebalance()
{
  imbprev = imbnow;

  // invoke balancer and reset comm->uniform flag

  int *sendproc;
  if (lbstyle == SHIFT) {
    itercount = balance->shift();
    comm->layout = LAYOUT_NONUNIFORM;
  } else if (lbstyle == BISECTION) {
    sendproc = balance->bisection();
    comm->layout = LAYOUT_TILED;
  }

  // output of final result

  if (fp) balance->dumpout(update->ntimestep,fp);

  // reset proc sub-domains

  if (domain->triclinic) domain->set_lamda_box();
  domain->set_local_box();

  // move atoms to new processors via irregular()
  // only needed if migrate_check() says an atom moves too far
  // else allow caller's comm->exchange() to do it

  if (domain->triclinic) domain->x2lamda(atom->nlocal);
  if (lbstyle == BISECTION) irregular->migrate_atoms(0,sendproc);
  else if (irregular->migrate_check()) irregular->migrate_atoms();
  if (domain->triclinic) domain->lamda2x(atom->nlocal);

  // invoke KSpace setup_grid() to adjust to new proc sub-domains

@ -303,5 +308,6 @@ double FixBalance::compute_vector(int i)

double FixBalance::memory_usage()
{
  double bytes = irregular->memory_usage();
  if (balance->rcb) bytes += balance->rcb->memory_usage();
  return bytes;
}
@ -987,3 +987,14 @@ void FixDeform::options(int narg, char **arg)
    } else error->all(FLERR,"Illegal fix deform command");
  }
}
/* ----------------------------------------------------------------------
memory usage of Irregular
------------------------------------------------------------------------- */
double FixDeform::memory_usage()
{
double bytes = 0.0;
if (irregular) bytes += irregular->memory_usage();
return bytes;
}
@ -35,6 +35,7 @@ class FixDeform : public Fix {
  void init();
  void pre_exchange();
  void end_of_step();
  double memory_usage();

 private:
  int triclinic,scaleflag,flipflag;
@ -2271,3 +2271,14 @@ void FixNH::pre_exchange()
    domain->lamda2x(atom->nlocal);
  }
}
/* ----------------------------------------------------------------------
memory usage of Irregular
------------------------------------------------------------------------- */
double FixNH::memory_usage()
{
double bytes = 0.0;
if (irregular) bytes += irregular->memory_usage();
return bytes;
}
@ -39,6 +39,7 @@ class FixNH : public Fix {
  void reset_target(double);
  void reset_dt();
  virtual void *extract(const char*,int &);
  double memory_usage();

 protected:
  int dimension,which;
@ -1159,19 +1159,15 @@ void Input::comm_style()
{
  if (narg < 1) error->all(FLERR,"Illegal comm_style command");
  if (strcmp(arg[0],"brick") == 0) {
    if (comm->style == 0) return;
    Comm *oldcomm = comm;
    comm = new CommBrick(lmp,oldcomm);
    delete oldcomm;
  } else if (strcmp(arg[0],"tiled") == 0) {
    if (comm->style == 1) return;
    Comm *oldcomm = comm;
    comm = new CommTiled(lmp,oldcomm);
    delete oldcomm;
  } else error->all(FLERR,"Illegal comm_style command");
}
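The rewritten comm_style command swaps communication styles by constructing the new Comm from the old one and then deleting the old instance, so user settings survive the switch. The sketch below imitates that construct-from-old / delete-old pattern with stand-in classes; the fields and constructors shown are assumptions for illustration, not the real LAMMPS Comm interface.

// sketch_comm_switch.cpp -- illustrative only; Comm/CommBrick/CommTiled here
// are stand-ins, not the LAMMPS classes
#include <cstdio>

struct Comm {
  int style;            // 0 = brick, 1 = tiled
  double cutghostuser;  // example of a user setting worth preserving
  Comm(int s) : style(s), cutghostuser(0.0) {}
  Comm(int s, const Comm &old) : style(s), cutghostuser(old.cutghostuser) {}
  virtual ~Comm() {}
};
struct CommBrick : Comm {
  CommBrick() : Comm(0) {}
  CommBrick(const Comm &old) : Comm(0,old) {}
};
struct CommTiled : Comm {
  CommTiled() : Comm(1) {}
  CommTiled(const Comm &old) : Comm(1,old) {}
};

int main()
{
  Comm *comm = new CommBrick();
  comm->cutghostuser = 3.5;            // pretend the user set an option

  // switch styles the way comm_style now does: build new from old, delete old
  Comm *oldcomm = comm;
  comm = new CommTiled(*oldcomm);
  delete oldcomm;

  printf("style %d, cutghostuser %g\n",comm->style,comm->cutghostuser);
  delete comm;
  return 0;
}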
@ -30,6 +30,8 @@ using namespace LAMMPS_NS;
int *Irregular::proc_recv_copy;
int compare_standalone(const void *, const void *);

enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files

#define BUFFACTOR 1.5
#define BUFMIN 1000
#define BUFEXTRA 1000
@ -43,13 +45,26 @@ Irregular::Irregular(LAMMPS *lmp) : Pointers(lmp)
  triclinic = domain->triclinic;
  map_style = atom->map_style;

  // migrate work vectors

  maxlocal = 0;
  mproclist = NULL;
  msizes = NULL;

  // send buffers

  maxdbuf = 0;
  dbuf = NULL;
  maxbuf = 0;
  buf = NULL;

  // universal work vectors

  memory->create(work1,nprocs,"irregular:work1");
  memory->create(work2,nprocs,"irregular:work2");

  // initialize buffers for migrate atoms, not used for datum comm
  // these can persist for multiple irregular operations

  maxsend = BUFMIN;
@ -62,9 +77,12 @@ Irregular::Irregular(LAMMPS *lmp) : Pointers(lmp)
Irregular::~Irregular()
{
  memory->destroy(mproclist);
  memory->destroy(msizes);
  memory->destroy(dbuf);
  memory->destroy(buf);
  memory->destroy(work1);
  memory->destroy(work2);
  memory->destroy(buf_send);
  memory->destroy(buf_recv);
}
@ -74,11 +92,13 @@ Irregular::~Irregular()
   can be used in place of comm->exchange()
   unlike exchange(), allows atoms to have moved arbitrarily long distances
   sets up irregular plan, invokes it, destroys it
   sortflag = flag for sorting order of received messages by proc ID
   procassign = non-NULL if already know procs atoms are assigned to (from RCB)
   atoms MUST be remapped to be inside simulation box before this is called
   for triclinic: atoms must be in lamda coords (0-1) before this is called
------------------------------------------------------------------------- */

void Irregular::migrate_atoms(int sortflag, int *procassign)
{
  // clear global->local map since atoms move to new procs
  // clear old ghosts so map_set() at end will operate only on local atoms
@ -101,13 +121,16 @@ void Irregular::migrate_atoms(int sortflag)
    subhi = domain->subhi_lamda;
  }

  layout = comm->layout;
  xsplit = comm->xsplit;
  ysplit = comm->ysplit;
  zsplit = comm->zsplit;
  boxlo = domain->boxlo;
  prd = domain->prd;
  procgrid = comm->procgrid;
  grid2proc = comm->grid2proc;

  // loop over atoms, flag any that are not in my sub-box
  // fill buffer with atoms leaving my box, using < and >=
  // assign which proc it belongs to via coord2proc()
@ -119,41 +142,63 @@ void Irregular::migrate_atoms(int sortflag)
  double **x = atom->x;
  int nlocal = atom->nlocal;

  if (nlocal > maxlocal) {
    maxlocal = nlocal;
    memory->destroy(mproclist);
    memory->destroy(msizes);
    memory->create(mproclist,maxlocal,"irregular:mproclist");
    memory->create(msizes,maxlocal,"irregular:msizes");
  }

  int igx,igy,igz;
  int nsend = 0;
  int nsendatom = 0;
  int i = 0;
  if (procassign) {
    while (i < nlocal) {
      if (procassign[i] == me) i++;
      else {
        mproclist[nsendatom] = procassign[i];
        if (nsend > maxsend) grow_send(nsend,1);
        msizes[nsendatom] = avec->pack_exchange(i,&buf_send[nsend]);
        nsend += msizes[nsendatom];
        nsendatom++;
        avec->copy(nlocal-1,i,1);
        procassign[i] = procassign[nlocal-1];
        nlocal--;
      }
    }

  } else {
    while (i < nlocal) {
      if (x[i][0] < sublo[0] || x[i][0] >= subhi[0] ||
          x[i][1] < sublo[1] || x[i][1] >= subhi[1] ||
          x[i][2] < sublo[2] || x[i][2] >= subhi[2]) {
        mproclist[nsendatom] = coord2proc(x[i],igx,igy,igz);
        if (mproclist[nsendatom] == me) i++;
        else {
          if (nsend > maxsend) grow_send(nsend,1);
          msizes[nsendatom] = avec->pack_exchange(i,&buf_send[nsend]);
          nsend += msizes[nsendatom];
          nsendatom++;
          avec->copy(nlocal-1,i,1);
          nlocal--;
        }
      } else i++;
    }
  }

  atom->nlocal = nlocal;

  // create irregular communication plan, perform comm, destroy plan
  // returned nrecv = size of buffer needed for incoming atoms

  int nrecv = create_atom(nsendatom,msizes,mproclist,sortflag);
  if (nrecv > maxrecv) grow_recv(nrecv);
  exchange_atom(buf_send,msizes,buf_recv);
  destroy_atom();

  // add received atoms to my list

  int m = 0;
@ -174,6 +219,11 @@ void Irregular::migrate_atoms(int sortflag)
int Irregular::migrate_check()
{
  // migrate required if comm layout is tiled
  // cannot use myloc[] logic below

  if (comm->layout == LAYOUT_TILED) return 1;

  // subbox bounds for orthogonal or triclinic box
  // other comm/domain data used by coord2proc()
@ -186,13 +236,16 @@ int Irregular::migrate_check()
    subhi = domain->subhi_lamda;
  }

  layout = comm->layout;
  xsplit = comm->xsplit;
  ysplit = comm->ysplit;
  zsplit = comm->zsplit;
  boxlo = domain->boxlo;
  prd = domain->prd;
  procgrid = comm->procgrid;
  grid2proc = comm->grid2proc;

  // loop over atoms, check for any that are not in my sub-box
  // assign which proc it belongs to via coord2proc()
  // if logical igx,igy,igz of newproc > one away from myloc, set flag = 1
@ -250,6 +303,7 @@ int Irregular::migrate_check()
   n = # of atoms to send
   sizes = # of doubles for each atom
   proclist = proc to send each atom to (not including self)
   sortflag = flag for sorting order of received messages by proc ID
   return total # of doubles I will recv (not including self)
------------------------------------------------------------------------- */
@ -257,51 +311,60 @@ int Irregular::create_atom(int n, int *sizes, int *proclist, int sortflag)
{
  int i;

  // setup for collective comm
  // work1 = 1 for procs I send a message to, not including self
  // work2 = 1 for all procs, used for ReduceScatter

  for (i = 0; i < nprocs; i++) {
    work1[i] = 0;
    work2[i] = 1;
  }
  for (i = 0; i < n; i++) work1[proclist[i]] = 1;
  work1[me] = 0;

  // nrecv_proc = # of procs I receive messages from, not including self
  // options for performing ReduceScatter operation
  // some are more efficient on some machines at big sizes

#ifdef LAMMPS_RS_ALLREDUCE_INPLACE
  MPI_Allreduce(MPI_IN_PLACE,work1,nprocs,MPI_INT,MPI_SUM,world);
  nrecv_proc = work1[me];
#else
#ifdef LAMMPS_RS_ALLREDUCE
  MPI_Allreduce(work1,work2,nprocs,MPI_INT,MPI_SUM,world);
  nrecv_proc = work2[me];
#else
  MPI_Reduce_scatter(work1,&nrecv_proc,work2,MPI_INT,MPI_SUM,world);
#endif
#endif

  // allocate receive arrays

  proc_recv = new int[nrecv_proc];
  length_recv = new int[nrecv_proc];
  request = new MPI_Request[nrecv_proc];
  status = new MPI_Status[nrecv_proc];

  // nsend_proc = # of messages I send

  for (i = 0; i < nprocs; i++) work1[i] = 0;
  for (i = 0; i < n; i++) work1[proclist[i]] += sizes[i];

  nsend_proc = 0;
  for (i = 0; i < nprocs; i++)
    if (work1[i]) nsend_proc++;

  // allocate send arrays

  proc_send = new int[nsend_proc];
  length_send = new int[nsend_proc];
  num_send = new int[nsend_proc];
  index_send = new int[n];
  offset_send = new int[n];

  // list still stores size of message for procs I send to
  // proc_send = procs I send to
  // length_send = # of doubles I send to each proc
  // to balance pattern of send messages:
  //   each proc begins with iproc > me, continues until iproc = me
  // reset list to store which send message each proc corresponds to
@ -311,81 +374,81 @@ int Irregular::create_atom(int n, int *sizes, int *proclist, int sortflag)
  for (i = 0; i < nprocs; i++) {
    iproc++;
    if (iproc == nprocs) iproc = 0;
    if (work1[iproc] > 0) {
      proc_send[isend] = iproc;
      length_send[isend] = work1[iproc];
      work1[iproc] = isend;
      isend++;
    }
  }

  // num_send = # of atoms I send to each proc

  for (i = 0; i < nsend_proc; i++) num_send[i] = 0;
  for (i = 0; i < n; i++) {
    isend = work1[proclist[i]];
    num_send[isend]++;
  }

  // work2 = offsets into index_send for each proc I send to
  // index_send = list of which atoms to send to each proc
  //   1st N1 values are atom indices for 1st proc,
  //   next N2 values are atom indices for 2nd proc, etc
  // offset_send = where each atom starts in send buffer

  work2[0] = 0;
  for (i = 1; i < nsend_proc; i++) work2[i] = work2[i-1] + num_send[i-1];

  for (i = 0; i < n; i++) {
    isend = work1[proclist[i]];
    index_send[work2[isend]++] = i;
    if (i) offset_send[i] = offset_send[i-1] + sizes[i-1];
    else offset_send[i] = 0;
  }

  // tell receivers how much data I send
  // sendmax_proc = # of doubles I send in largest single message

  sendmax_proc = 0;
  for (i = 0; i < nsend_proc; i++) {
    MPI_Send(&length_send[i],1,MPI_INT,proc_send[i],0,world);
    sendmax_proc = MAX(sendmax_proc,length_send[i]);
  }

  // receive incoming messages
  // proc_recv = procs I recv from
  // length_recv = # of doubles each proc sends me
  // nrecvsize = total size of atom data I recv

  int nrecvsize = 0;
  for (i = 0; i < nrecv_proc; i++) {
    MPI_Recv(&length_recv[i],1,MPI_INT,MPI_ANY_SOURCE,0,world,status);
    proc_recv[i] = status->MPI_SOURCE;
    nrecvsize += length_recv[i];
  }

  // sort proc_recv and length_recv by proc ID if requested
  // useful for debugging to insure reproducible ordering of received atoms
  // invoke by adding final arg = 1 to create_atom() call in migrate_atoms()

  if (sortflag) {
    int *order = new int[nrecv_proc];
    int *proc_recv_ordered = new int[nrecv_proc];
    int *length_recv_ordered = new int[nrecv_proc];

    for (i = 0; i < nrecv_proc; i++) order[i] = i;
    proc_recv_copy = proc_recv;
    qsort(order,nrecv_proc,sizeof(int),compare_standalone);

    int j;
    for (i = 0; i < nrecv_proc; i++) {
      j = order[i];
      proc_recv_ordered[i] = proc_recv[j];
      length_recv_ordered[i] = length_recv[j];
    }
    memcpy(proc_recv,proc_recv_ordered,nrecv_proc*sizeof(int));
    memcpy(length_recv,length_recv_ordered,nrecv_proc*sizeof(int));
    delete [] order;
    delete [] proc_recv_ordered;
    delete [] length_recv_ordered;
@ -396,27 +459,7 @@ int Irregular::create_atom(int n, int *sizes, int *proclist, int sortflag)
  MPI_Barrier(world);

  // return size of atom data I will receive

  return nrecvsize;
}
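The counting step above relies on a ReduceScatter trick: every proc sets work1[p]=1 for each proc p it will message, and a reduce-scatter with one slot per rank hands each proc the column sum for its own slot, i.e. how many messages it should expect. Here is a self-contained MPI sketch of just that step; the ring send pattern is chosen arbitrarily for the example and is not from the commit.

// sketch_reduce_scatter.cpp -- illustrative only
// build: mpicxx sketch_reduce_scatter.cpp ; run: mpirun -np 4 ./a.out
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  int me,nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD,&me);
  MPI_Comm_size(MPI_COMM_WORLD,&nprocs);

  // each rank decides to send one message to rank (me+1)%nprocs
  std::vector<int> work1(nprocs,0),work2(nprocs,1);
  if (nprocs > 1) work1[(me+1)%nprocs] = 1;      // flag my destinations
  work1[me] = 0;                                 // self does not count

  // recvcounts = work2 = one slot per rank, so each rank receives the
  // column sum for its own slot = # of ranks that flagged it
  int nrecv_proc;
  MPI_Reduce_scatter(work1.data(),&nrecv_proc,work2.data(),
                     MPI_INT,MPI_SUM,MPI_COMM_WORLD);

  printf("rank %d will receive from %d other rank(s)\n",me,nrecv_proc);
  MPI_Finalize();
  return 0;
}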
@ -445,217 +488,226 @@ int compare_standalone(const void *iptr, const void *jptr)
void Irregular::exchange_atom(double *sendbuf, int *sizes, double *recvbuf)
{
  int i,m,n,offset,count;

  // post all receives

  offset = 0;
  for (int irecv = 0; irecv < nrecv_proc; irecv++) {
    MPI_Irecv(&recvbuf[offset],length_recv[irecv],MPI_DOUBLE,
              proc_recv[irecv],0,world,&request[irecv]);
    offset += length_recv[irecv];
  }

  // reallocate buf for largest send if necessary

  if (sendmax_proc > maxdbuf) {
    memory->destroy(dbuf);
    maxdbuf = sendmax_proc;
    memory->create(dbuf,maxdbuf,"irregular:dbuf");
  }

  // send each message
  // pack buf with list of atoms
  // m = index of atom in sendbuf

  n = 0;
  for (int isend = 0; isend < nsend_proc; isend++) {
    offset = 0;
    count = num_send[isend];
    for (i = 0; i < count; i++) {
      m = index_send[n++];
      memcpy(&dbuf[offset],&sendbuf[offset_send[m]],sizes[m]*sizeof(double));
      offset += sizes[m];
    }
    MPI_Send(dbuf,length_send[isend],MPI_DOUBLE,proc_send[isend],0,world);
  }

  // wait on all incoming messages

  if (nrecv_proc) MPI_Waitall(nrecv_proc,request,status);
}
/* ----------------------------------------------------------------------
   destroy vectors in communication plan for atoms
------------------------------------------------------------------------- */

void Irregular::destroy_atom()
{
  delete [] proc_send;
  delete [] length_send;
  delete [] num_send;
  delete [] index_send;
  delete [] offset_send;
  delete [] proc_recv;
  delete [] length_recv;
  delete [] request;
  delete [] status;
}
/* ----------------------------------------------------------------------
   create communication plan based on list of datums of uniform size
   n = # of datums to send
   proclist = proc to send each datum to, can include self
   sortflag = flag for sorting order of received messages by proc ID
   return total # of datums I will recv, including any to self
------------------------------------------------------------------------- */

int Irregular::create_data(int n, int *proclist, int sortflag)
{
  int i,m;
  // setup for collective comm
  // work1 = 1 for procs I send a message to, not including self
  // work2 = 1 for all procs, used for ReduceScatter

  for (i = 0; i < nprocs; i++) {
    work1[i] = 0;
    work2[i] = 1;
  }
  for (i = 0; i < n; i++) work1[proclist[i]] = 1;
  work1[me] = 0;

  // nrecv_proc = # of procs I receive messages from, not including self
  // options for performing ReduceScatter operation
  // some are more efficient on some machines at big sizes

#ifdef LAMMPS_RS_ALLREDUCE_INPLACE
  MPI_Allreduce(MPI_IN_PLACE,work1,nprocs,MPI_INT,MPI_SUM,world);
  nrecv_proc = work1[me];
#else
#ifdef LAMMPS_RS_ALLREDUCE
  MPI_Allreduce(work1,work2,nprocs,MPI_INT,MPI_SUM,world);
  nrecv_proc = work2[me];
#else
  MPI_Reduce_scatter(work1,&nrecv_proc,work2,MPI_INT,MPI_SUM,world);
#endif
#endif

  // allocate receive arrays

  proc_recv = new int[nrecv_proc];
  num_recv = new int[nrecv_proc];
  request = new MPI_Request[nrecv_proc];
  status = new MPI_Status[nrecv_proc];

  // work1 = # of datums I send to each proc, including self
  // nsend_proc = # of procs I send messages to, not including self

  for (i = 0; i < nprocs; i++) work1[i] = 0;
  for (i = 0; i < n; i++) work1[proclist[i]]++;

  nsend_proc = 0;
  for (i = 0; i < nprocs; i++)
    if (work1[i]) nsend_proc++;
  if (work1[me]) nsend_proc--;

  // allocate send and self arrays

  proc_send = new int[nsend_proc];
  num_send = new int[nsend_proc];
  index_send = new int[n-work1[me]];
  index_self = new int[work1[me]];

  // proc_send = procs I send to
  // num_send = # of datums I send to each proc
  // num_self = # of datums I copy to self
  // to balance pattern of send messages:
  //   each proc begins with iproc > me, continues until iproc = me
  // reset work1 to store which send message each proc corresponds to

  int iproc = me;
  int isend = 0;
  for (i = 0; i < nprocs; i++) {
    iproc++;
    if (iproc == nprocs) iproc = 0;
    if (iproc == me) {
      num_self = work1[iproc];
      work1[iproc] = 0;
    } else if (work1[iproc] > 0) {
      proc_send[isend] = iproc;
      num_send[isend] = work1[iproc];
      work1[iproc] = isend;
      isend++;
    }
  }

  // work2 = offsets into index_send for each proc I send to
  // m = ptr into index_self
  // index_send = list of which datums to send to each proc
  //   1st N1 values are datum indices for 1st proc,
  //   next N2 values are datum indices for 2nd proc, etc
  // index_self = list of which datums to copy to self

  work2[0] = 0;
  for (i = 1; i < nsend_proc; i++) work2[i] = work2[i-1] + num_send[i-1];

  m = 0;
  for (i = 0; i < n; i++) {
    iproc = proclist[i];
    if (iproc == me) index_self[m++] = i;
    else {
      isend = work1[iproc];
      index_send[work2[isend]++] = i;
    }
  }

  // tell receivers how much data I send
  // sendmax_proc = largest # of datums I send in a single message

  sendmax_proc = 0;
  for (i = 0; i < nsend_proc; i++) {
    MPI_Send(&num_send[i],1,MPI_INT,proc_send[i],0,world);
    sendmax_proc = MAX(sendmax_proc,num_send[i]);
  }

  // receive incoming messages
  // proc_recv = procs I recv from
  // num_recv = total size of message each proc sends me
  // nrecvdatum = total size of data I recv

  int nrecvdatum = 0;
  for (i = 0; i < nrecv_proc; i++) {
    MPI_Recv(&num_recv[i],1,MPI_INT,MPI_ANY_SOURCE,0,world,status);
    proc_recv[i] = status->MPI_SOURCE;
    nrecvdatum += num_recv[i];
  }
  nrecvdatum += num_self;

  // sort proc_recv and num_recv by proc ID if requested
  // useful for debugging to insure reproducible ordering of received datums

  if (sortflag) {
    int *order = new int[nrecv_proc];
    int *proc_recv_ordered = new int[nrecv_proc];
    int *num_recv_ordered = new int[nrecv_proc];

    for (i = 0; i < nrecv_proc; i++) order[i] = i;
    proc_recv_copy = proc_recv;
    qsort(order,nrecv_proc,sizeof(int),compare_standalone);

    int j;
    for (i = 0; i < nrecv_proc; i++) {
      j = order[i];
      proc_recv_ordered[i] = proc_recv[j];
      num_recv_ordered[i] = num_recv[j];
    }
    memcpy(proc_recv,proc_recv_ordered,nrecv_proc*sizeof(int));
    memcpy(num_recv,num_recv_ordered,nrecv_proc*sizeof(int));
    delete [] order;
    delete [] proc_recv_ordered;
    delete [] num_recv_ordered;
  }

  // barrier to insure all MPI_ANY_SOURCE messages are received
  // else another proc could proceed to exchange_data() and send to me

  MPI_Barrier(world);

  // return # of datums I will receive

  return nrecvdatum;
}
/* ----------------------------------------------------------------------
@ -667,49 +719,41 @@ int Irregular::create_data(int n, int *proclist)
void Irregular::exchange_data(char *sendbuf, int nbytes, char *recvbuf)
{
  int i,m,n,offset,count;

  // post all receives, starting after self copies

  offset = num_self*nbytes;
  for (int irecv = 0; irecv < nrecv_proc; irecv++) {
    MPI_Irecv(&recvbuf[offset],num_recv[irecv]*nbytes,MPI_CHAR,
              proc_recv[irecv],0,world,&request[irecv]);
    offset += num_recv[irecv]*nbytes;
  }

  // reallocate buf for largest send if necessary

  if (sendmax_proc*nbytes > maxbuf) {
    memory->destroy(buf);
    maxbuf = sendmax_proc*nbytes;
    memory->create(buf,maxbuf,"irregular:buf");
  }

  // send each message
  // pack buf with list of datums
  // m = index of datum in sendbuf

  n = 0;
  for (int isend = 0; isend < nsend_proc; isend++) {
    count = num_send[isend];
    for (i = 0; i < count; i++) {
      m = index_send[n++];
      memcpy(&buf[i*nbytes],&sendbuf[m*nbytes],nbytes);
    }
    MPI_Send(buf,count*nbytes,MPI_CHAR,proc_send[isend],0,world);
  }

  // copy datums to self, put at beginning of recvbuf

  for (i = 0; i < num_self; i++) {
    m = index_self[i];
    memcpy(&recvbuf[i*nbytes],&sendbuf[m*nbytes],nbytes);
@ -717,39 +761,37 @@ void Irregular::exchange_data(char *sendbuf, int nbytes, char *recvbuf)
  // wait on all incoming messages

  if (nrecv_proc) MPI_Waitall(nrecv_proc,request,status);
}
/* ----------------------------------------------------------------------
   destroy vectors in communication plan for datums
------------------------------------------------------------------------- */

void Irregular::destroy_data()
{
  delete [] proc_send;
  delete [] num_send;
  delete [] index_send;
  delete [] proc_recv;
  delete [] num_recv;
  delete [] index_self;
  delete [] request;
  delete [] status;
}
/* ----------------------------------------------------------------------
   determine which proc owns atom with coord x[3]
   x will be in box (orthogonal) or lamda coords (triclinic)
   if layout = UNIFORM, calculate owning proc directly
   else layout = NONUNIFORM, iteratively find owning proc via binary search
   return owning proc ID via grid2proc
   return igx,igy,igz = logical grid loc of owning proc within 3d grid of procs
------------------------------------------------------------------------- */

int Irregular::coord2proc(double *x, int &igx, int &igy, int &igz)
{
  if (layout == 0) {
    if (triclinic == 0) {
      igx = static_cast<int> (procgrid[0] * (x[0]-boxlo[0]) / prd[0]);
      igy = static_cast<int> (procgrid[1] * (x[1]-boxlo[1]) / prd[1]);
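coord2proc() therefore has two paths: a direct index computation when the split is uniform, and a binary search over the split fractions when it is not. The standalone sketch below shows both lookups in one dimension with made-up split values; it is an illustration, not the LAMMPS routine.

// sketch_coord2proc.cpp -- illustrative only: owner lookup in one dimension
#include <cstdio>

// uniform layout: grid index straight from the fractional coordinate
int owner_uniform(double x, double lo, double prd, int nprocs)
{
  int ig = (int) (nprocs * (x-lo) / prd);
  if (ig >= nprocs) ig = nprocs - 1;
  return ig;
}

// nonuniform layout: split[] holds nprocs+1 increasing fractions from 0 to 1;
// binary search for the interval containing the fractional coordinate
int owner_nonuniform(double x, double lo, double prd,
                     const double *split, int nprocs)
{
  double frac = (x-lo) / prd;
  int ilo = 0, ihi = nprocs;
  while (ihi - ilo > 1) {
    int mid = (ilo + ihi) / 2;
    if (frac >= split[mid]) ilo = mid;
    else ihi = mid;
  }
  return ilo;
}

int main()
{
  double split[5] = {0.0, 0.1, 0.3, 0.6, 1.0};   // 4 unequal slices
  double x = 4.5, lo = 0.0, prd = 10.0;
  printf("uniform owner: %d\n",owner_uniform(x,lo,prd,4));              // 1
  printf("nonuniform owner: %d\n",owner_nonuniform(x,lo,prd,split,4));  // 2
  return 0;
}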
@ -846,7 +888,12 @@ void Irregular::grow_recv(int n)
bigint Irregular::memory_usage()
{
  bigint bytes = 0;
  bytes += maxsend*sizeof(double);      // buf_send
  bytes += maxrecv*sizeof(double);      // buf_recv
  bytes += maxdbuf*sizeof(double);      // dbuf
  bytes += maxbuf;                      // buf
  bytes += 2*maxlocal*sizeof(int);      // mproclist,msizes
  bytes += 2*nprocs*sizeof(int);        // work1,work2
  return bytes;
}

View File

@@ -27,9 +27,9 @@ class Irregular : protected Pointers {
Irregular(class LAMMPS *); Irregular(class LAMMPS *);
~Irregular(); ~Irregular();
void migrate_atoms(int sortflag = 0); void migrate_atoms(int sortflag = 0, int *procassign = NULL);
int migrate_check(); int migrate_check();
int create_data(int, int *); int create_data(int, int *, int sortflag = 0);
void exchange_data(char *, int, char *); void exchange_data(char *, int, char *);
void destroy_data(); void destroy_data();
bigint memory_usage(); bigint memory_usage();
@@ -38,58 +38,58 @@ class Irregular : protected Pointers {
int me,nprocs; int me,nprocs;
int triclinic; int triclinic;
int map_style; int map_style;
int uniform; int layout;
double *xsplit,*ysplit,*zsplit; // ptrs to comm double *xsplit,*ysplit,*zsplit; // ptrs to comm
int *procgrid; // ptr to comm int *procgrid; // ptr to comm
int ***grid2proc; // ptr to comm int ***grid2proc; // ptr to comm
double *boxlo; // ptr to domain double *boxlo; // ptr to domain
double *prd; // ptr to domain double *prd; // ptr to domain
int maxsend,maxrecv; // size of buffers in # of doubles int maxsend,maxrecv; // size of buf send/recv in # of doubles
double *buf_send,*buf_recv; double *buf_send,*buf_recv; // bufs used in migrate_atoms
int maxdbuf; // size of double buf in bytes
double *dbuf; // double buf for largest single atom send
int maxbuf; // size of char buf in bytes
char *buf; // char buf for largest single data send
// plan for irregular communication of atoms int *mproclist,*msizes; // persistent vectors in migrate_atoms
int maxlocal; // allocated size of mproclist and msizes
int *work1,*work2; // work vectors
// plan params for irregular communication of atoms or datums
// no params refer to atoms/data copied to self
int nsend_proc; // # of messages to send
int nrecv_proc; // # of messages to recv
int sendmax_proc; // # of doubles/datums in largest send message
int *proc_send; // list of procs to send to
int *num_send; // # of atoms/datums to send to each proc
int *index_send; // list of which atoms/datums to send to each proc
int *proc_recv; // list of procs to recv from
MPI_Request *request; // MPI requests for posted recvs
MPI_Status *status; // MPI statuses for WaitAll
// extra plan params for irregular communication of atoms
// no params refer to atoms copied to self // no params refer to atoms copied to self
struct PlanAtom { int *length_send; // # of doubles to send to each proc
int nsend; // # of messages to send int *length_recv; // # of doubles to recv from each proc
int nrecv; // # of messages to recv int *offset_send; // where each atom starts in send buffer
int sendmax; // # of doubles in largest send message
int *proc_send; // procs to send to
int *length_send; // # of doubles to send to each proc
int *num_send; // # of atoms to send to each proc
int *index_send; // list of which atoms to send to each proc
int *offset_send; // where each atom starts in send buffer
int *proc_recv; // procs to recv from
int *length_recv; // # of doubles to recv from each proc
MPI_Request *request; // MPI requests for posted recvs
MPI_Status *status; // MPI statuses for WaitAll
};
// plan for irregular communication of datums // extra plan params for irregular communication of datums
// only 2 self params refer to atoms copied to self // 2 self params refer to data copied to self
struct PlanData { // plan for irregular communication of data int *num_recv; // # of datums to recv from each proc
int nsend; // # of messages to send int num_self; // # of datums to copy to self
int nrecv; // # of messages to recv int *index_self; // list of which datums to copy to self
int sendmax; // # of datums in largest send message
int *proc_send; // procs to send to
int *num_send; // # of datums to send to each proc
int *index_send; // list of which datums to send to each proc
int *proc_recv; // procs to recv from
int *num_recv; // # of datums to recv from each proc
int num_self; // # of datums to copy to self
int *index_self; // list of which datums to copy to self
MPI_Request *request; // MPI requests for posted recvs
MPI_Status *status; // MPI statuses for WaitAll
};
PlanAtom *aplan; // private methods
PlanData *dplan;
int create_atom(int, int *, int *, int); int create_atom(int, int *, int *, int);
void exchange_atom(double *, int *, double *); void exchange_atom(double *, int *, double *);
void destroy_atom(); void destroy_atom();
int coord2proc(double *, int &, int &, int &); int coord2proc(double *, int &, int &, int &);
int binary(double, int, double *); int binary(double, int, double *);
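Taken together, the datum methods in this header are meant to be called as a create/exchange/destroy sequence: create_data() builds the plan from a per-datum destination-proc list and returns how many datums will arrive, the caller sizes its receive buffer accordingly, then exchange_data() moves the fixed-size datums and destroy_data() tears the plan down. A hedged usage sketch of that sequence (the Datum struct, the route_datums() wrapper, and how proclist gets filled are illustrative assumptions, not part of this commit):

#include "irregular.h"
#include "memory.h"
using namespace LAMMPS_NS;

struct Datum { int id; double value; };   // hypothetical fixed-size payload

// lmp = existing LAMMPS instance, n = # of local datums,
// proclist[i] = destination proc of sendbuf[i]
void route_datums(LAMMPS *lmp, int n, Datum *sendbuf, int *proclist)
{
  Irregular *irregular = new Irregular(lmp);

  // build communication plan; returns # of datums this proc will receive
  int nrecv = irregular->create_data(n, proclist);

  Datum *recvbuf;
  lmp->memory->create(recvbuf, nrecv, "example:recvbuf");

  // all datums are sizeof(Datum) bytes; self-copies land at the front of recvbuf
  irregular->exchange_data((char *) sendbuf, sizeof(Datum), (char *) recvbuf);
  irregular->destroy_data();
  delete irregular;

  // ... consume recvbuf[0..nrecv-1] ...
  lmp->memory->destroy(recvbuf);
}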

926
src/rcb.cpp Normal file
View File

@@ -0,0 +1,926 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "mpi.h"
#include "string.h"
#include "rcb.h"
#include "irregular.h"
#include "memory.h"
#include "error.h"
using namespace LAMMPS_NS;
#define MYHUGE 1.0e30
#define TINY 1.0e-6
// set this to bigger number after debugging
#define DELTA 10
// prototypes for non-class functions
void box_merge(void *, void *, int *, MPI_Datatype *);
void median_merge(void *, void *, int *, MPI_Datatype *);
// NOTE: if want to have reuse flag, need to sum Tree across procs
/* ---------------------------------------------------------------------- */
RCB::RCB(LAMMPS *lmp) : Pointers(lmp)
{
MPI_Comm_rank(world,&me);
MPI_Comm_size(world,&nprocs);
ndot = maxdot = 0;
dots = NULL;
nlist = maxlist = 0;
dotlist = dotmark = NULL;
maxbuf = 0;
buf = NULL;
maxrecv = maxsend = 0;
recvproc = recvindex = sendproc = sendindex = NULL;
tree = NULL;
irregular = NULL;
// create MPI data and function types for box and median AllReduce ops
MPI_Type_contiguous(6,MPI_DOUBLE,&box_type);
MPI_Type_commit(&box_type);
MPI_Type_contiguous(sizeof(Median),MPI_CHAR,&med_type);
MPI_Type_commit(&med_type);
MPI_Op_create(box_merge,1,&box_op);
MPI_Op_create(median_merge,1,&med_op);
reuse = 0;
}
/* ---------------------------------------------------------------------- */
RCB::~RCB()
{
memory->sfree(dots);
memory->destroy(dotlist);
memory->destroy(dotmark);
memory->sfree(buf);
memory->destroy(recvproc);
memory->destroy(recvindex);
memory->destroy(sendproc);
memory->destroy(sendindex);
memory->sfree(tree);
delete irregular;
MPI_Type_free(&med_type);
MPI_Type_free(&box_type);
MPI_Op_free(&box_op);
MPI_Op_free(&med_op);
}
/* ----------------------------------------------------------------------
perform RCB balancing of N particles at coords X in bounding box LO/HI
if wt = NULL, ignore per-particle weights
if wt defined, per-particle weights > 0.0
dimension = 2 or 3
as documented in rcb.h:
sets noriginal,nfinal,nkeep,recvproc,recvindex,lo,hi
all proc particles will be inside or on surface of 3-d box
defined by final lo/hi
// NOTE: worry about re-use of data structs for fix balance
// NOTE: should I get rid of wt all together, will it be used?
------------------------------------------------------------------------- */
void RCB::compute(int dimension, int n, double **x, double *wt,
double *bboxlo, double *bboxhi)
{
int i,j,k;
int keep,outgoing,incoming,incoming2;
int dim,markactive;
int indexlo,indexhi;
int first_iteration,breakflag;
double wttot,wtlo,wthi,wtsum,wtok,wtupto,wtmax;
double targetlo,targethi;
double valuemin,valuemax,valuehalf;
double tolerance;
MPI_Comm comm,comm_half;
MPI_Request request,request2;
MPI_Status status;
Median med,medme;
// create list of my Dots
ndot = nkeep = noriginal = n;
if (ndot > maxdot) {
maxdot = ndot;
memory->sfree(dots);
dots = (Dot *) memory->smalloc(ndot*sizeof(Dot),"RCB:dots");
}
for (i = 0; i < ndot; i++) {
dots[i].x[0] = x[i][0];
dots[i].x[1] = x[i][1];
dots[i].x[2] = x[i][2];
dots[i].proc = me;
dots[i].index = i;
}
if (wt)
for (i = 0; i < ndot; i++) dots[i].wt = wt[i];
else
for (i = 0; i < ndot; i++) dots[i].wt = 1.0;
// initial bounding box = simulation box
// includes periodic or shrink-wrapped boundaries
lo = bbox.lo;
hi = bbox.hi;
lo[0] = bboxlo[0];
lo[1] = bboxlo[1];
lo[2] = bboxlo[2];
hi[0] = bboxhi[0];
hi[1] = bboxhi[1];
hi[2] = bboxhi[2];
// initialize counters
counters[0] = 0;
counters[1] = 0;
counters[2] = 0;
counters[3] = ndot;
counters[4] = maxdot;
counters[5] = 0;
counters[6] = 0;
// create communicator for use in recursion
MPI_Comm_dup(world,&comm);
// recurse until partition is a single proc = me
// proclower,procupper = lower,upper procs in partition
// procmid = 1st proc in upper half of partition
int procpartner,procpartner2;
int readnumber;
int procmid;
int proclower = 0;
int procupper = nprocs - 1;
while (proclower != procupper) {
// if odd # of procs, lower partition gets extra one
procmid = proclower + (procupper - proclower) / 2 + 1;
// determine communication partner(s)
// readnumber = # of proc partners to read from
if (me < procmid)
procpartner = me + (procmid - proclower);
else
procpartner = me - (procmid - proclower);
readnumber = 1;
if (procpartner > procupper) {
readnumber = 0;
procpartner--;
}
if (me == procupper && procpartner != procmid - 1) {
readnumber = 2;
procpartner2 = procpartner + 1;
}
// wttot = summed weight of entire partition
// search tolerance = largest single weight (plus epsilon)
// targetlo = desired weight in lower half of partition
// targethi = desired weight in upper half of partition
wtmax = wtsum = 0.0;
if (wt) {
for (i = 0; i < ndot; i++) {
wtsum += dots[i].wt;
if (dots[i].wt > wtmax) wtmax = dots[i].wt;
}
} else {
for (i = 0; i < ndot; i++) wtsum += dots[i].wt;
wtmax = 1.0;
}
MPI_Allreduce(&wtsum,&wttot,1,MPI_DOUBLE,MPI_SUM,comm);
if (wt) MPI_Allreduce(&wtmax,&tolerance,1,MPI_DOUBLE,MPI_MAX,comm);
else tolerance = 1.0;
tolerance *= 1.0 + TINY;
targetlo = wttot * (procmid - proclower) / (procupper + 1 - proclower);
targethi = wttot - targetlo;
// dim = dimension to bisect on
// do not allow choice of z dimension for 2d system
dim = 0;
if (hi[1]-lo[1] > hi[0]-lo[0]) dim = 1;
if (dimension == 3) {
if (dim == 0 && hi[2]-lo[2] > hi[0]-lo[0]) dim = 2;
if (dim == 1 && hi[2]-lo[2] > hi[1]-lo[1]) dim = 2;
}
// create active list and mark array for dots
// initialize active list to all dots
if (ndot > maxlist) {
memory->destroy(dotlist);
memory->destroy(dotmark);
maxlist = maxdot;
memory->create(dotlist,maxlist,"RCB:dotlist");
memory->create(dotmark,maxlist,"RCB:dotmark");
}
nlist = ndot;
for (i = 0; i < nlist; i++) dotlist[i] = i;
// median iteration
// zoom in on bisector until correct # of dots in each half of partition
// as each iteration of median-loop begins, require:
// all non-active dots are marked with 0/1 in dotmark
// valuemin <= every active dot <= valuemax
// wtlo, wthi = total wt of non-active dots
// when leave median-loop, require only:
// valuehalf = correct cut position
// all dots <= valuehalf are marked with 0 in dotmark
// all dots >= valuehalf are marked with 1 in dotmark
// markactive = which side of cut is active = 0/1
// indexlo,indexhi = indices of dot closest to median
wtlo = wthi = 0.0;
valuemin = lo[dim];
valuemax = hi[dim];
first_iteration = 1;
while (1) {
// choose bisector value
// use old value on 1st iteration if old cut dimension is the same
// on 2nd option: could push valuehalf towards geometric center
// with "1.0-factor" to force overshoot
if (first_iteration && reuse && dim == tree[procmid].dim) {
counters[5]++;
valuehalf = tree[procmid].cut;
if (valuehalf < valuemin || valuehalf > valuemax)
valuehalf = 0.5 * (valuemin + valuemax);
} else if (wt)
valuehalf = valuemin + (targetlo - wtlo) /
(wttot - wtlo - wthi) * (valuemax - valuemin);
else
valuehalf = 0.5 * (valuemin + valuemax);
first_iteration = 0;
// initialize local median data structure
medme.totallo = medme.totalhi = 0.0;
medme.valuelo = -MYHUGE;
medme.valuehi = MYHUGE;
medme.wtlo = medme.wthi = 0.0;
medme.countlo = medme.counthi = 0;
medme.proclo = medme.prochi = me;
// mark all active dots on one side or other of bisector
// also set all fields in median data struct
// save indices of closest dots on either side
for (j = 0; j < nlist; j++) {
i = dotlist[j];
if (dots[i].x[dim] <= valuehalf) { // in lower part
medme.totallo += dots[i].wt;
dotmark[i] = 0;
if (dots[i].x[dim] > medme.valuelo) { // my closest dot
medme.valuelo = dots[i].x[dim];
medme.wtlo = dots[i].wt;
medme.countlo = 1;
indexlo = i;
} else if (dots[i].x[dim] == medme.valuelo) { // tied for closest
medme.wtlo += dots[i].wt;
medme.countlo++;
}
}
else { // in upper part
medme.totalhi += dots[i].wt;
dotmark[i] = 1;
if (dots[i].x[dim] < medme.valuehi) { // my closest dot
medme.valuehi = dots[i].x[dim];
medme.wthi = dots[i].wt;
medme.counthi = 1;
indexhi = i;
} else if (dots[i].x[dim] == medme.valuehi) { // tied for closest
medme.wthi += dots[i].wt;
medme.counthi++;
}
}
}
// combine median data struct across current subset of procs
counters[0]++;
MPI_Allreduce(&medme,&med,1,med_type,med_op,comm);
// test median guess for convergence
// move additional dots that are next to cut across it
if (wtlo + med.totallo < targetlo) { // lower half TOO SMALL
wtlo += med.totallo;
valuehalf = med.valuehi;
if (med.counthi == 1) { // only one dot to move
if (wtlo + med.wthi < targetlo) { // move it, keep iterating
if (me == med.prochi) dotmark[indexhi] = 0;
}
else { // only move if beneficial
if (wtlo + med.wthi - targetlo < targetlo - wtlo)
if (me == med.prochi) dotmark[indexhi] = 0;
break; // all done
}
}
else { // multiple dots to move
breakflag = 0;
wtok = 0.0;
if (medme.valuehi == med.valuehi) wtok = medme.wthi;
if (wtlo + med.wthi >= targetlo) { // all done
MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
wtmax = targetlo - wtlo;
if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
breakflag = 1;
} // wtok = most I can move
for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
i = dotlist[j];
if (dots[i].x[dim] == med.valuehi) { // only move if better
if (wtsum + dots[i].wt - wtok < wtok - wtsum)
dotmark[i] = 0;
wtsum += dots[i].wt;
}
}
if (breakflag) break; // done if moved enough
}
wtlo += med.wthi;
if (targetlo-wtlo <= tolerance) break; // close enough
valuemin = med.valuehi; // iterate again
markactive = 1;
}
else if (wthi + med.totalhi < targethi) { // upper half TOO SMALL
wthi += med.totalhi;
valuehalf = med.valuelo;
if (med.countlo == 1) { // only one dot to move
if (wthi + med.wtlo < targethi) { // move it, keep iterating
if (me == med.proclo) dotmark[indexlo] = 1;
}
else { // only move if beneficial
if (wthi + med.wtlo - targethi < targethi - wthi)
if (me == med.proclo) dotmark[indexlo] = 1;
break; // all done
}
}
else { // multiple dots to move
breakflag = 0;
wtok = 0.0;
if (medme.valuelo == med.valuelo) wtok = medme.wtlo;
if (wthi + med.wtlo >= targethi) { // all done
MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
wtmax = targethi - wthi;
if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
breakflag = 1;
} // wtok = most I can move
for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
i = dotlist[j];
if (dots[i].x[dim] == med.valuelo) { // only move if better
if (wtsum + dots[i].wt - wtok < wtok - wtsum)
dotmark[i] = 1;
wtsum += dots[i].wt;
}
}
if (breakflag) break; // done if moved enough
}
wthi += med.wtlo;
if (targethi-wthi <= tolerance) break; // close enough
valuemax = med.valuelo; // iterate again
markactive = 0;
}
else // Goldilocks result: both partitions just right
break;
// shrink the active list
k = 0;
for (j = 0; j < nlist; j++) {
i = dotlist[j];
if (dotmark[i] == markactive) dotlist[k++] = i;
}
nlist = k;
}
// found median
// store cut info only if I am procmid
if (me == procmid) {
cut = valuehalf;
cutdim = dim;
}
// use cut to shrink my RCB bounding box
if (me < procmid) hi[dim] = valuehalf;
else lo[dim] = valuehalf;
// outgoing = number of dots to ship to partner
// nkeep = number of dots that have never migrated
markactive = (me < procpartner);
for (i = 0, keep = 0, outgoing = 0; i < ndot; i++)
if (dotmark[i] == markactive) outgoing++;
else if (i < nkeep) keep++;
nkeep = keep;
// alert partner how many dots I'll send, read how many I'll recv
MPI_Send(&outgoing,1,MPI_INT,procpartner,0,world);
incoming = 0;
if (readnumber) {
MPI_Recv(&incoming,1,MPI_INT,procpartner,0,world,&status);
if (readnumber == 2) {
MPI_Recv(&incoming2,1,MPI_INT,procpartner2,0,world,&status);
incoming += incoming2;
}
}
// check if need to alloc more space
int ndotnew = ndot - outgoing + incoming;
if (ndotnew > maxdot) {
while (maxdot < ndotnew) maxdot += DELTA;
dots = (Dot *) memory->srealloc(dots,maxdot*sizeof(Dot),"RCB::dots");
counters[6]++;
}
counters[1] += outgoing;
counters[2] += incoming;
if (ndotnew > counters[3]) counters[3] = ndotnew;
if (maxdot > counters[4]) counters[4] = maxdot;
// malloc comm send buffer
if (outgoing > maxbuf) {
memory->sfree(buf);
maxbuf = outgoing;
buf = (Dot *) memory->smalloc(maxbuf*sizeof(Dot),"RCB:buf");
}
// fill buffer with dots that are marked for sending
// pack down the unmarked ones
keep = outgoing = 0;
for (i = 0; i < ndot; i++) {
if (dotmark[i] == markactive)
memcpy(&buf[outgoing++],&dots[i],sizeof(Dot));
else
memcpy(&dots[keep++],&dots[i],sizeof(Dot));
}
// post receives for dots
if (readnumber > 0) {
MPI_Irecv(&dots[keep],incoming*sizeof(Dot),MPI_CHAR,
procpartner,1,world,&request);
if (readnumber == 2) {
keep += incoming - incoming2;
MPI_Irecv(&dots[keep],incoming2*sizeof(Dot),MPI_CHAR,
procpartner2,1,world,&request2);
}
}
// handshake before sending dots to insure recvs have been posted
if (readnumber > 0) {
MPI_Send(NULL,0,MPI_INT,procpartner,0,world);
if (readnumber == 2) MPI_Send(NULL,0,MPI_INT,procpartner2,0,world);
}
MPI_Recv(NULL,0,MPI_INT,procpartner,0,world,&status);
// send dots to partner
MPI_Rsend(buf,outgoing*sizeof(Dot),MPI_CHAR,procpartner,1,world);
// wait until all dots are received
if (readnumber > 0) {
MPI_Wait(&request,&status);
if (readnumber == 2) MPI_Wait(&request2,&status);
}
ndot = ndotnew;
// cut partition in half, create new communicators of 1/2 size
int split;
if (me < procmid) {
procupper = procmid - 1;
split = 0;
} else {
proclower = procmid;
split = 1;
}
MPI_Comm_split(comm,split,me,&comm_half);
MPI_Comm_free(&comm);
comm = comm_half;
}
// clean up
MPI_Comm_free(&comm);
// set public variables with results of rebalance
nfinal = ndot;
if (nfinal > maxrecv) {
memory->destroy(recvproc);
memory->destroy(recvindex);
maxrecv = nfinal;
memory->create(recvproc,maxrecv,"RCB:recvproc");
memory->create(recvindex,maxrecv,"RCB:recvindex");
}
for (i = 0; i < nfinal; i++) {
recvproc[i] = dots[i].proc;
recvindex[i] = dots[i].index;
}
}
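The recursion bookkeeping at the top of the loop (procmid, procpartner, targetlo/targethi) is plain integer and weight arithmetic; a small standalone sketch of one recursion level may make the partitioning easier to follow (the 7-process, unit-weight example is hypothetical):

#include <cstdio>

int main()
{
  // one level of the RCB recursion over procs [proclower,procupper]
  int proclower = 0, procupper = 6;          // 7 procs, odd count
  double wttot = 7.0;                        // unit weights, one dot per proc

  // lower partition gets the extra proc: procmid = first proc of upper half
  int procmid = proclower + (procupper - proclower) / 2 + 1;     // = 4

  // desired weight on each side of the cut
  double targetlo = wttot * (procmid - proclower) / (procupper + 1 - proclower);
  double targethi = wttot - targetlo;        // 4.0 and 3.0

  // communication partner of a given proc at this level
  int me = 6;
  int procpartner = (me < procmid) ? me + (procmid - proclower)
                                   : me - (procmid - proclower);  // 6 -> 2

  printf("procmid %d targetlo %g targethi %g partner of %d is %d\n",
         procmid, targetlo, targethi, me, procpartner);
  return 0;
}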
/* ----------------------------------------------------------------------
custom MPI reduce operation
merge of each component of an RCB bounding box
------------------------------------------------------------------------- */
void box_merge(void *in, void *inout, int *len, MPI_Datatype *dptr)
{
RCB::BBox *box1 = (RCB::BBox *) in;
RCB::BBox *box2 = (RCB::BBox *) inout;
for (int i = 0; i < 3; i++) {
if (box1->lo[i] < box2->lo[i]) box2->lo[i] = box1->lo[i];
if (box1->hi[i] > box2->hi[i]) box2->hi[i] = box1->hi[i];
}
}
/* ----------------------------------------------------------------------
custom MPI reduce operation
merge median data structure
on input:
in,inout->totallo, totalhi = weight in both partitions on this proc
valuelo, valuehi = pos of nearest dot(s) to cut on this proc
wtlo, wthi = total wt of dot(s) at that pos on this proc
countlo, counthi = # of dot(s) nearest to cut on this proc
proclo, prochi = not used
on exit:
inout-> totallo, totalhi = total # of active dots in both partitions
valuelo, valuehi = pos of nearest dot(s) to cut
wtlo, wthi = total wt of dot(s) at that position
countlo, counthi = total # of dot(s) nearest to cut
proclo, prochi = one unique proc who owns a nearest dot
all procs must get same proclo,prochi
------------------------------------------------------------------------- */
void median_merge(void *in, void *inout, int *len, MPI_Datatype *dptr)
{
RCB::Median *med1 = (RCB::Median *) in;
RCB::Median *med2 = (RCB::Median *) inout;
med2->totallo += med1->totallo;
if (med1->valuelo > med2->valuelo) {
med2->valuelo = med1->valuelo;
med2->wtlo = med1->wtlo;
med2->countlo = med1->countlo;
med2->proclo = med1->proclo;
}
else if (med1->valuelo == med2->valuelo) {
med2->wtlo += med1->wtlo;
med2->countlo += med1->countlo;
if (med1->proclo < med2->proclo) med2->proclo = med1->proclo;
}
med2->totalhi += med1->totalhi;
if (med1->valuehi < med2->valuehi) {
med2->valuehi = med1->valuehi;
med2->wthi = med1->wthi;
med2->counthi = med1->counthi;
med2->prochi = med1->prochi;
}
else if (med1->valuehi == med2->valuehi) {
med2->wthi += med1->wthi;
med2->counthi += med1->counthi;
if (med1->prochi < med2->prochi) med2->prochi = med1->prochi;
}
}
/* ----------------------------------------------------------------------
invert the RCB rebalance result to convert receive info into send info
sortflag = flag for sorting order of received messages by proc ID
------------------------------------------------------------------------- */
void RCB::invert(int sortflag)
{
Invert *sbuf,*rbuf;
// only create Irregular if not previously created
// allows Irregular to persist for multiple RCB calls by fix balance
if (!irregular) irregular = new Irregular(lmp);
// nsend = # of dots to request from other procs
int nsend = nfinal-nkeep;
int *proclist;
memory->create(proclist,nsend,"RCB:proclist");
Invert *sinvert =
(Invert *) memory->smalloc(nsend*sizeof(Invert),"RCB:sinvert");
int m = 0;
for (int i = nkeep; i < nfinal; i++) {
proclist[m] = recvproc[i];
sinvert[m].rindex = recvindex[i];
sinvert[m].sproc = me;
sinvert[m].sindex = i;
m++;
}
// perform inversion via irregular comm
// nrecv = # of my dots to send to other procs
int nrecv = irregular->create_data(nsend,proclist,sortflag);
Invert *rinvert =
(Invert *) memory->smalloc(nrecv*sizeof(Invert),"RCB:rinvert");
irregular->exchange_data((char *) sinvert,sizeof(Invert),(char *) rinvert);
irregular->destroy_data();
// set public variables from requests to send my dots
if (noriginal > maxsend) {
memory->destroy(sendproc);
memory->destroy(sendindex);
maxsend = noriginal;
memory->create(sendproc,maxsend,"RCB:sendproc");
memory->create(sendindex,maxsend,"RCB:sendindex");
}
for (int i = 0; i < nkeep; i++) {
sendproc[recvindex[i]] = me;
sendindex[recvindex[i]] = i;
}
for (int i = 0; i < nrecv; i++) {
m = rinvert[i].rindex;
sendproc[m] = rinvert[i].sproc;
sendindex[m] = rinvert[i].sindex;
}
// clean-up
memory->destroy(proclist);
memory->destroy(sinvert);
memory->destroy(rinvert);
}
/* ----------------------------------------------------------------------
memory use of Irregular
------------------------------------------------------------------------- */
bigint RCB::memory_usage()
{
bigint bytes = 0;
if (irregular) bytes += irregular->memory_usage();
return bytes;
}
// -----------------------------------------------------------------------
// DEBUG methods
// -----------------------------------------------------------------------
/*
// consistency checks on RCB results
void RCB::check()
{
int i,iflag,total1,total2;
double weight,wtmax,wtmin,wtone,tolerance;
// check that total # of dots remained the same
MPI_Allreduce(&ndotorig,&total1,1,MPI_INT,MPI_SUM,world);
MPI_Allreduce(&ndot,&total2,1,MPI_INT,MPI_SUM,world);
if (total1 != total2) {
if (me == 0)
printf("ERROR: Points before RCB = %d, Points after RCB = %d\n",
total1,total2);
}
// check that result is load-balanced within log2(P)*max-wt
weight = wtone = 0.0;
for (i = 0; i < ndot; i++) {
weight += dots[i].wt;
if (dots[i].wt > wtone) wtone = dots[i].wt;
}
MPI_Allreduce(&weight,&wtmin,1,MPI_DOUBLE,MPI_MIN,world);
MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
MPI_Allreduce(&wtone,&tolerance,1,MPI_DOUBLE,MPI_MAX,world);
// i = smallest power-of-2 >= nprocs
// tolerance = largest-single-weight*log2(nprocs)
for (i = 0; (nprocs >> i) != 0; i++);
tolerance = tolerance * i * (1.0 + TINY);
if (wtmax - wtmin > tolerance) {
if (me == 0)
printf("ERROR: Load-imbalance > tolerance of %g\n",tolerance);
MPI_Barrier(world);
if (weight == wtmin) printf(" Proc %d has weight = %g\n",me,weight);
if (weight == wtmax) printf(" Proc %d has weight = %g\n",me,weight);
}
MPI_Barrier(world);
// check that final set of points is inside RCB box of each proc
iflag = 0;
for (i = 0; i < ndot; i++) {
if (dots[i].x[0] < lo[0] || dots[i].x[0] > hi[0] ||
dots[i].x[1] < lo[1] || dots[i].x[1] > hi[1] ||
dots[i].x[2] < lo[2] || dots[i].x[2] > hi[2])
iflag++;
}
if (iflag > 0)
printf("ERROR: %d points are out-of-box on proc %d\n",iflag,me);
}
// stats for RCB decomposition
void RCB::stats(int flag)
{
int i,iflag,sum,min,max;
double ave,rsum,rmin,rmax;
double weight,wttot,wtmin,wtmax;
if (me == 0) printf("RCB Statistics:\n");
// distribution info
for (i = 0, weight = 0.0; i < ndot; i++) weight += dots[i].wt;
MPI_Allreduce(&weight,&wttot,1,MPI_DOUBLE,MPI_SUM,world);
MPI_Allreduce(&weight,&wtmin,1,MPI_DOUBLE,MPI_MIN,world);
MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
if (me == 0) {
printf(" Total weight of dots = %g\n",wttot);
printf(" Weight on each proc: ave = %g, max = %g, min = %g\n",
wttot/nprocs,wtmax,wtmin);
}
if (flag) {
MPI_Barrier(world);
printf(" Proc %d has weight = %g\n",me,weight);
}
for (i = 0, weight = 0.0; i < ndot; i++)
if (dots[i].wt > weight) weight = dots[i].wt;
MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
if (me == 0) printf(" Maximum weight of single dot = %g\n",wtmax);
if (flag) {
MPI_Barrier(world);
printf(" Proc %d max weight = %g\n",me,weight);
}
// counter info
MPI_Allreduce(&counters[0],&sum,1,MPI_INT,MPI_SUM,world);
MPI_Allreduce(&counters[0],&min,1,MPI_INT,MPI_MIN,world);
MPI_Allreduce(&counters[0],&max,1,MPI_INT,MPI_MAX,world);
ave = ((double) sum)/nprocs;
if (me == 0)
printf(" Median iter: ave = %g, min = %d, max = %d\n",ave,min,max);
if (flag) {
MPI_Barrier(world);
printf(" Proc %d median count = %d\n",me,counters[0]);
}
MPI_Allreduce(&counters[1],&sum,1,MPI_INT,MPI_SUM,world);
MPI_Allreduce(&counters[1],&min,1,MPI_INT,MPI_MIN,world);
MPI_Allreduce(&counters[1],&max,1,MPI_INT,MPI_MAX,world);
ave = ((double) sum)/nprocs;
if (me == 0)
printf(" Send count: ave = %g, min = %d, max = %d\n",ave,min,max);
if (flag) {
MPI_Barrier(world);
printf(" Proc %d send count = %d\n",me,counters[1]);
}
MPI_Allreduce(&counters[2],&sum,1,MPI_INT,MPI_SUM,world);
MPI_Allreduce(&counters[2],&min,1,MPI_INT,MPI_MIN,world);
MPI_Allreduce(&counters[2],&max,1,MPI_INT,MPI_MAX,world);
ave = ((double) sum)/nprocs;
if (me == 0)
printf(" Recv count: ave = %g, min = %d, max = %d\n",ave,min,max);
if (flag) {
MPI_Barrier(world);
printf(" Proc %d recv count = %d\n",me,counters[2]);
}
MPI_Allreduce(&counters[3],&sum,1,MPI_INT,MPI_SUM,world);
MPI_Allreduce(&counters[3],&min,1,MPI_INT,MPI_MIN,world);
MPI_Allreduce(&counters[3],&max,1,MPI_INT,MPI_MAX,world);
ave = ((double) sum)/nprocs;
if (me == 0)
printf(" Max dots: ave = %g, min = %d, max = %d\n",ave,min,max);
if (flag) {
MPI_Barrier(world);
printf(" Proc %d max dots = %d\n",me,counters[3]);
}
MPI_Allreduce(&counters[4],&sum,1,MPI_INT,MPI_SUM,world);
MPI_Allreduce(&counters[4],&min,1,MPI_INT,MPI_MIN,world);
MPI_Allreduce(&counters[4],&max,1,MPI_INT,MPI_MAX,world);
ave = ((double) sum)/nprocs;
if (me == 0)
printf(" Max memory: ave = %g, min = %d, max = %d\n",ave,min,max);
if (flag) {
MPI_Barrier(world);
printf(" Proc %d max memory = %d\n",me,counters[4]);
}
if (reuse) {
MPI_Allreduce(&counters[5],&sum,1,MPI_INT,MPI_SUM,world);
MPI_Allreduce(&counters[5],&min,1,MPI_INT,MPI_MIN,world);
MPI_Allreduce(&counters[5],&max,1,MPI_INT,MPI_MAX,world);
ave = ((double) sum)/nprocs;
if (me == 0)
printf(" # of Reuse: ave = %g, min = %d, max = %d\n",ave,min,max);
if (flag) {
MPI_Barrier(world);
printf(" Proc %d # of Reuse = %d\n",me,counters[5]);
}
}
MPI_Allreduce(&counters[6],&sum,1,MPI_INT,MPI_SUM,world);
MPI_Allreduce(&counters[6],&min,1,MPI_INT,MPI_MIN,world);
MPI_Allreduce(&counters[6],&max,1,MPI_INT,MPI_MAX,world);
ave = ((double) sum)/nprocs;
if (me == 0)
printf(" # of OverAlloc: ave = %g, min = %d, max = %d\n",ave,min,max);
if (flag) {
MPI_Barrier(world);
printf(" Proc %d # of OverAlloc = %d\n",me,counters[6]);
}
// RCB boxes for each proc
if (flag) {
if (me == 0) printf(" RCB sub-domain boxes:\n");
for (i = 0; i < 3; i++) {
MPI_Barrier(world);
if (me == 0) printf(" Dimension %d\n",i+1);
MPI_Barrier(world);
printf(" Proc = %d: Box = %g %g\n",me,lo[i],hi[i]);
}
}
}
*/

131
src/rcb.h Normal file
View File

@@ -0,0 +1,131 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LAMMPS_RCB_H
#define LAMMPS_RCB_H
#include "mpi.h"
#include "pointers.h"
namespace LAMMPS_NS {
class RCB : protected Pointers {
public:
// set by compute()
int noriginal; // # of dots I own before balancing
int nfinal; // # of dots I own after balancing
int nkeep; // how many dots of noriginal I still own
// will be first nkept of nfinal list
int *recvproc; // proc IDs of nfinal dots
int *recvindex; // index of nfinal dots on owning procs
// based on input list for compute()
double *lo,*hi; // final bounding box of my RCB sub-domain
double cut; // single cut (in Tree) owned by this proc
int cutdim; // dimension (0,1,2) of the cut
// set by invert()
int *sendproc; // proc to send each of my noriginal dots to
int *sendindex; // index of dot in receiver's nfinal list
RCB(class LAMMPS *);
~RCB();
void compute(int, int, double **, double *, double *, double *);
void invert(int sortflag = 0);
bigint memory_usage();
// DEBUG methods
//void check();
//void stats(int);
// RCB cut info
struct Median {
double totallo,totalhi; // weight in each half of active partition
double valuelo,valuehi; // position of dot(s) nearest to cut
double wtlo,wthi; // total weight of dot(s) at that position
int countlo,counthi; // # of dots at that position
int proclo,prochi; // unique proc who owns a nearest dot
};
struct BBox {
double lo[3],hi[3]; // corner points of a bounding box
};
private:
int me,nprocs;
// point to balance on
struct Dot {
double x[3]; // coord of point
double wt; // weight of point
int proc; // owning proc
int index; // index on owning proc
};
// tree of RCB cuts
struct Tree {
double cut; // position of cut
int dim; // dimension = 0/1/2 of cut
};
// inversion message
struct Invert {
int rindex; // index on receiving proc
int sproc; // sending proc
int sindex; // index on sending proc
};
Dot *dots; // dots on this proc
int ndot; // # of dots on this proc
int maxdot; // allocated size of dots
int ndotorig;
int nlist;
int maxlist;
int *dotlist;
int *dotmark;
int maxbuf;
Dot *buf;
int maxrecv,maxsend;
BBox bbox;
class Irregular *irregular;
MPI_Op box_op,med_op;
MPI_Datatype box_type,med_type;
int reuse; // 1/0 to use/not use previous cuts
int dottop; // dots >= this index are new
double bboxlo[3]; // bounding box of final RCB sub-domain
double bboxhi[3];
Tree *tree; // tree of RCB cuts, used by reuse()
int counters[7]; // diagnostic counts
// 0 = # of median iterations
// 1 = # of points sent
// 2 = # of points received
// 3 = most points this proc ever owns
// 4 = most point memory this proc ever allocs
// 5 = # of times a previous cut is re-used
// 6 = # of reallocs of point vector
};
}
#endif
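Based on the public interface above, a typical caller (e.g. a balance command) would hand compute() its local points and the global bounding box, then call invert() to learn where each of its original points should be sent. A hedged sketch of that call sequence (the rebalance() wrapper and its arguments are illustrative; error handling and reuse of the RCB object are omitted):

#include "rcb.h"
using namespace LAMMPS_NS;

// lmp = existing LAMMPS instance, n = # of local points, x = n-by-3 coords,
// boxlo/boxhi = global bounding box; weights are ignored by passing NULL
void rebalance(LAMMPS *lmp, int n, double **x, double *boxlo, double *boxhi)
{
  RCB *rcb = new RCB(lmp);

  // cut the box recursively until each proc owns one sub-domain
  rcb->compute(3, n, x, NULL, boxlo, boxhi);

  // rcb->lo / rcb->hi now bound my sub-domain,
  // rcb->recvproc / rcb->recvindex identify the points I ended up with

  // convert recv info into send info for my noriginal points
  rcb->invert();

  // rcb->sendproc[i] / rcb->sendindex[i] say where original point i goes
  delete rcb;
}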

View File

@@ -29,6 +29,8 @@ using namespace LAMMPS_NS;
#define LB_FACTOR 1.1 #define LB_FACTOR 1.1
#define EPSILON 1.0e-6 #define EPSILON 1.0e-6
enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED}; // several files
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
Replicate::Replicate(LAMMPS *lmp) : Pointers(lmp) {} Replicate::Replicate(LAMMPS *lmp) : Pointers(lmp) {}
@@ -220,17 +222,33 @@ void Replicate::command(int narg, char **arg)
sublo[2] = domain->sublo_lamda[2]; subhi[2] = domain->subhi_lamda[2]; sublo[2] = domain->sublo_lamda[2]; subhi[2] = domain->subhi_lamda[2];
} }
if (domain->xperiodic) { if (comm->layout != LAYOUT_TILED) {
if (comm->myloc[0] == 0) sublo[0] -= epsilon[0]; if (domain->xperiodic) {
if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0]; if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
} if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
if (domain->yperiodic) { }
if (comm->myloc[1] == 0) sublo[1] -= epsilon[1]; if (domain->yperiodic) {
if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1]; if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
} if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
if (domain->zperiodic) { }
if (comm->myloc[2] == 0) sublo[2] -= epsilon[2]; if (domain->zperiodic) {
if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2]; if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
}
} else {
if (domain->xperiodic) {
if (comm->mysplit[0][0] == 0.0) sublo[0] -= epsilon[0];
if (comm->mysplit[0][1] == 1.0) subhi[0] += epsilon[0];
}
if (domain->yperiodic) {
if (comm->mysplit[1][0] == 0.0) sublo[1] -= epsilon[1];
if (comm->mysplit[1][1] == 1.0) subhi[1] += epsilon[1];
}
if (domain->zperiodic) {
if (comm->mysplit[2][0] == 0.0) sublo[2] -= epsilon[2];
if (comm->mysplit[2][1] == 1.0) subhi[2] += epsilon[2];
}
} }
// loop over all procs // loop over all procs
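With a tiled decomposition there is no myloc/procgrid indexing, so the new branch above tests the sub-domain's split fractions instead: a face lies on a periodic box boundary exactly when the corresponding mysplit entry is 0.0 or 1.0. A minimal sketch of that test in one dimension (the mysplit values and helper are hypothetical, not part of the commit):

#include <cstdio>

// Expand a sub-domain bound by epsilon if it touches a periodic box face,
// mirroring the LAYOUT_TILED branch above for a single dimension.
void expand_if_on_boundary(const double mysplit[2], double &sublo, double &subhi,
                           double epsilon)
{
  if (mysplit[0] == 0.0) sublo -= epsilon;   // touches lower box face
  if (mysplit[1] == 1.0) subhi += epsilon;   // touches upper box face
}

int main()
{
  double split[2] = {0.0, 0.25};             // hypothetical RCB slab in x
  double sublo = 0.0, subhi = 2.5;
  expand_if_on_boundary(split, sublo, subhi, 1.0e-6);
  printf("sublo %g subhi %g\n", sublo, subhi);  // only sublo was expanded
  return 0;
}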