git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7734 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2012-02-09 16:56:02 +00:00
parent 2ab478095e
commit d892afbf67
8 changed files with 1235 additions and 199 deletions
--- a/src/balance.cpp
+++ b/src/balance.cpp
@ -0,0 +1,728 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "math.h"
+#include "stdlib.h"
+#include "string.h"
+#include "balance.h"
+#include "atom.h"
+#include "comm.h"
+#include "irregular.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+enum{NONE,UNIFORM,USER,DYNAMIC};
+enum{X,Y,Z};
+enum{EXPAND,CONTRACT};
+
+/* ---------------------------------------------------------------------- */
+
+Balance::Balance(LAMMPS *lmp) : Pointers(lmp)
+{
+  MPI_Comm_rank(world,&me);
+  MPI_Comm_size(world,&nprocs);
+
+  memory->create(pcount,nprocs,"balance:pcount");
+  memory->create(allcount,nprocs,"balance:allcount");
+
+  user_xsplit = user_ysplit = user_zsplit = NULL;
+  dflag = 0;
+
+  fp = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+Balance::~Balance()
+{
+  memory->destroy(pcount);
+  memory->destroy(allcount);
+
+  delete [] user_xsplit;
+  delete [] user_ysplit;
+  delete [] user_zsplit;
+
+  if (dflag) {
+    delete [] bstr;
+    delete [] ops;
+    delete [] counts[0];
+    delete [] counts[1];
+    delete [] counts[2];
+    delete [] cuts;
+    delete [] onecount;
+    //MPI_Comm_free(&commslice[0]);
+    //MPI_Comm_free(&commslice[1]);
+    //MPI_Comm_free(&commslice[2]);
+  }
+
+  if (fp) fclose(fp);
+}
+
+/* ----------------------------------------------------------------------
+   called as balance command in input script
+------------------------------------------------------------------------- */
+
+void Balance::command(int narg, char **arg)
+{
+  if (domain->box_exist == 0)
+    error->all(FLERR,"Balance command before simulation box is defined");
+
+  if (comm->me == 0 && screen) fprintf(screen,"Balancing ...\n");
+
+  // parse arguments
+
+  int dimension = domain->dimension;
+  int *procgrid = comm->procgrid;
+  xflag = yflag = zflag = NONE;
+  dflag = 0;
+
+  int iarg = 0;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"x") == 0) {
+      if (xflag == DYNAMIC) error->all(FLERR,"Illegal balance command");
+      if (strcmp(arg[iarg+1],"uniform") == 0) {
+	if (iarg+2 > narg) error->all(FLERR,"Illegal balance command");
+	xflag = UNIFORM;
+	iarg += 2;
+      } else {
+	if (1 + procgrid[0]-1 > narg)
+	  error->all(FLERR,"Illegal balance command");
+	xflag = USER;
+	delete [] user_xsplit;
+	user_xsplit = new double[procgrid[0]+1];
+	user_xsplit[0] = 0.0;
+	iarg++;
+	for (int i = 1; i < procgrid[0]; i++)
+	  user_xsplit[i] = force->numeric(arg[iarg++]);
+	user_xsplit[procgrid[0]] = 1.0;
+      }
+    } else if (strcmp(arg[iarg],"y") == 0) {
+      if (yflag == DYNAMIC) error->all(FLERR,"Illegal balance command");
+      if (strcmp(arg[iarg+1],"uniform") == 0) {
+	if (iarg+2 > narg) error->all(FLERR,"Illegal balance command");
+	yflag = UNIFORM;
+	iarg += 2;
+      } else {
+	if (1 + procgrid[1]-1 > narg)
+	  error->all(FLERR,"Illegal balance command");
+	yflag = USER;
+	delete [] user_ysplit;
+	user_ysplit = new double[procgrid[1]+1];
+	user_ysplit[0] = 0.0;
+	iarg++;
+	for (int i = 1; i < procgrid[1]; i++)
+	  user_ysplit[i] = force->numeric(arg[iarg++]);
+	user_ysplit[procgrid[1]] = 1.0;
+      }
+    } else if (strcmp(arg[iarg],"z") == 0) {
+      if (zflag == DYNAMIC) error->all(FLERR,"Illegal balance command");
+      if (strcmp(arg[iarg+1],"uniform") == 0) {
+	if (iarg+2 > narg) error->all(FLERR,"Illegal balance command");
+	zflag = UNIFORM;
+	iarg += 2;
+      } else {
+	if (1 + procgrid[2]-1 > narg)
+	  error->all(FLERR,"Illegal balance command");
+	zflag = USER;
+	delete [] user_zsplit;
+	user_zsplit = new double[procgrid[2]+1];
+	user_zsplit[0] = 0.0;
+	iarg++;
+	for (int i = 1; i < procgrid[2]; i++)
+	  user_zsplit[i] = force->numeric(arg[iarg++]);
+	user_zsplit[procgrid[2]] = 1.0;
+      }
+
+    } else if (strcmp(arg[iarg],"dynamic") == 0) {
+      if (xflag != NONE || yflag != NONE || zflag != NONE)
+	error->all(FLERR,"Illegal balance command");
+      if (iarg+5 > narg) error->all(FLERR,"Illegal balance command");
+      dflag = 1;
+      xflag = yflag = DYNAMIC;
+      if (dimension == 3) zflag = DYNAMIC;
+      nrepeat = atoi(arg[iarg+1]);
+      niter = atoi(arg[iarg+2]);
+      if (nrepeat <= 0 || niter <= 0)
+	error->all(FLERR,"Illegal balance command");
+      int n = strlen(arg[iarg+3]) + 1;
+      bstr = new char[n];
+      strcpy(bstr,arg[iarg+3]);
+      thresh = atof(arg[iarg+4]);
+      if (thresh < 1.0) error->all(FLERR,"Illegal balance command");
+      iarg += 5;
+
+    } else error->all(FLERR,"Illegal balance command");
+  }
+
+  // error check
+
+  if (zflag && dimension == 2)
+    error->all(FLERR,"Cannot balance in z dimension for 2d simulation");
+
+  if (xflag == USER)
+    for (int i = 1; i <= procgrid[0]; i++)
+      if (user_xsplit[i-1] >= user_xsplit[i]) 
+	error->all(FLERR,"Illegal balance command");
+  if (yflag == USER)
+    for (int i = 1; i <= procgrid[1]; i++)
+      if (user_ysplit[i-1] >= user_ysplit[i]) 
+	error->all(FLERR,"Illegal balance command");
+  if (zflag == USER)
+    for (int i = 1; i <= procgrid[2]; i++)
+      if (user_zsplit[i-1] >= user_zsplit[i]) 
+	error->all(FLERR,"Illegal balance command");
+
+  if (dflag) {
+    for (int i = 0; i < strlen(bstr); i++) {
+      if (bstr[i] != 'x' && bstr[i] != 'y' && bstr[i] != 'z') 
+	error->all(FLERR,"Balance dynamic string is invalid");
+      if (bstr[i] == 'z' && dimension == 2) 
+	error->all(FLERR,"Balance dynamic string is invalid for 2d simulation");
+    }
+  }
+
+  // insure atoms are in current box & update box via shrink-wrap
+  // no exchange() since doesn't matter if atoms are assigned to correct procs
+
+  if (domain->triclinic) domain->x2lamda(atom->nlocal);
+  domain->pbc();
+  domain->reset_box();
+  if (domain->triclinic) domain->lamda2x(atom->nlocal);
+
+  // imbinit = initial imbalance
+  // use current splits instead of nlocal since atoms may not be in sub-box
+
+  domain->x2lamda(atom->nlocal);
+  int maxinit;
+  double imbinit = imbalance_splits(maxinit);
+  domain->lamda2x(atom->nlocal);
+
+  // debug output of initial state
+
+  //dumpout(update->ntimestep);
+
+  // explicit setting of sub-domain sizes
+
+  if (xflag == UNIFORM) {
+    for (int i = 0; i < procgrid[0]; i++) 
+      comm->xsplit[i] = i * 1.0/procgrid[0];
+    comm->xsplit[procgrid[0]] = 1.0;
+  }
+
+  if (yflag == UNIFORM) {
+    for (int i = 0; i < procgrid[1]; i++) 
+      comm->ysplit[i] = i * 1.0/procgrid[1];
+    comm->ysplit[procgrid[1]] = 1.0;
+  }
+
+  if (zflag == UNIFORM) {
+    for (int i = 0; i < procgrid[2]; i++) 
+      comm->zsplit[i] = i * 1.0/procgrid[2];
+    comm->zsplit[procgrid[2]] = 1.0;
+  }
+
+  if (xflag == USER)
+    for (int i = 0; i <= procgrid[0]; i++) comm->xsplit[i] = user_xsplit[i];
+
+  if (yflag == USER)
+    for (int i = 0; i <= procgrid[1]; i++) comm->ysplit[i] = user_ysplit[i];
+
+  if (zflag == USER)
+    for (int i = 0; i <= procgrid[2]; i++) comm->zsplit[i] = user_zsplit[i];
+
+  // dynamic load-balance of sub-domain sizes
+
+  if (dflag) {
+    dynamic_setup(bstr);
+    dynamic();
+  } else count = 0;
+
+  // debug output of final result
+
+  //dumpout(-1);
+
+  // reset comm->uniform flag if necessary
+
+  if (comm->uniform) {
+    if (xflag == USER || xflag == DYNAMIC) comm->uniform = 0;
+    if (yflag == USER || yflag == DYNAMIC) comm->uniform = 0;
+    if (zflag == USER || zflag == DYNAMIC) comm->uniform = 0;
+  } else {
+    if (dimension == 3) {
+      if (xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
+	comm->uniform = 1;
+    } else {
+      if (xflag == UNIFORM && yflag == UNIFORM) comm->uniform = 1;
+    }
+  }
+
+  // reset proc sub-domains
+
+  if (domain->triclinic) domain->set_lamda_box();
+  domain->set_local_box();
+
+  // move atoms to new processors via irregular()
+
+  if (domain->triclinic) domain->x2lamda(atom->nlocal);
+  Irregular *irregular = new Irregular(lmp);
+  irregular->migrate_atoms();
+  delete irregular;
+  if (domain->triclinic) domain->lamda2x(atom->nlocal);
+
+  // check if any atoms were lost
+
+  bigint natoms;
+  bigint nblocal = atom->nlocal;
+  MPI_Allreduce(&nblocal,&natoms,1,MPI_LMP_BIGINT,MPI_SUM,world);
+  if (natoms != atom->natoms) {
+    char str[128];
+    sprintf(str,"Lost atoms via balance: original " BIGINT_FORMAT 
+	    " current " BIGINT_FORMAT,atom->natoms,natoms);
+    error->all(FLERR,str);
+  }
+
+  // imbfinal = final imbalance based on final nlocal
+
+  int maxfinal;
+  double imbfinal = imbalance_nlocal(maxfinal);
+
+  if (me == 0) {
+    if (screen) {
+      fprintf(screen,"  iteration count = %d\n",count);
+      fprintf(screen,"  initial/final max atoms/proc = %d %d\n",
+	      maxinit,maxfinal);
+      fprintf(screen,"  initial/final imbalance factor = %g %g\n",
+	      imbinit,imbfinal);
+    }
+    if (logfile) {
+      fprintf(logfile,"  iteration count = %d\n",count);
+      fprintf(logfile,"  initial/final max atoms/proc = %d %d\n",
+	      maxinit,maxfinal);
+      fprintf(logfile,"  initial/final imbalance factor = %g %g\n",
+	      imbinit,imbfinal);
+    }
+  }
+
+  if (me == 0) {
+    if (screen) {
+      fprintf(screen,"  x cuts:");
+      for (int i = 0; i <= comm->procgrid[0]; i++)
+	fprintf(screen," %g",comm->xsplit[i]);
+      fprintf(screen,"\n");
+      fprintf(screen,"  y cuts:");
+      for (int i = 0; i <= comm->procgrid[1]; i++)
+	fprintf(screen," %g",comm->ysplit[i]);
+      fprintf(screen,"\n");
+      fprintf(screen,"  z cuts:");
+      for (int i = 0; i <= comm->procgrid[2]; i++)
+	fprintf(screen," %g",comm->zsplit[i]);
+      fprintf(screen,"\n");
+    }
+    if (logfile) {
+      fprintf(logfile,"  x cuts:");
+      for (int i = 0; i <= comm->procgrid[0]; i++)
+	fprintf(logfile," %g",comm->xsplit[i]);
+      fprintf(logfile,"\n");
+      fprintf(logfile,"  y cuts:");
+      for (int i = 0; i <= comm->procgrid[1]; i++)
+	fprintf(logfile," %g",comm->ysplit[i]);
+      fprintf(logfile,"\n");
+      fprintf(logfile,"  z cuts:");
+      for (int i = 0; i <= comm->procgrid[2]; i++)
+	fprintf(logfile," %g",comm->zsplit[i]);
+      fprintf(logfile,"\n");
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   calculate imbalance based on nlocal
+   return max = max atom per proc
+   return imbalance factor = max atom per proc / ave atom per proc
+------------------------------------------------------------------------- */
+
+double Balance::imbalance_nlocal(int &max)
+{
+  MPI_Allreduce(&atom->nlocal,&max,1,MPI_INT,MPI_MAX,world);
+  double imbalance = max / (1.0 * atom->natoms / nprocs);
+  return imbalance;
+}
+
+/* ----------------------------------------------------------------------
+   calculate imbalance based on processor splits in 3 dims
+   atoms must be in lamda coords (0-1) before called
+   map atoms to 3d grid of procs
+   return max = max atom per proc
+   return imbalance factor = max atom per proc / ave atom per proc
+------------------------------------------------------------------------- */
+
+double Balance::imbalance_splits(int &max)
+{
+  double *xsplit = comm->xsplit;
+  double *ysplit = comm->ysplit;
+  double *zsplit = comm->zsplit;
+
+  int nx = comm->procgrid[0];
+  int ny = comm->procgrid[1];
+  int nz = comm->procgrid[2];
+
+  for (int i = 0; i < nprocs; i++) pcount[i] = 0;
+
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+  int ix,iy,iz;
+
+  for (int i = 0; i < nlocal; i++) {
+    ix = binary(x[i][0],nx,xsplit);
+    iy = binary(x[i][1],ny,ysplit);
+    iz = binary(x[i][2],nz,zsplit);
+    pcount[iz*nx*ny + iy*nx + ix]++;
+  }
+
+  MPI_Allreduce(pcount,allcount,nprocs,MPI_INT,MPI_SUM,world);
+  max = 0;
+  for (int i = 0; i < nprocs; i++) max = MAX(max,allcount[i]);
+  double imbalance = max / (1.0 * atom->natoms / nprocs);
+  return imbalance;
+}
+
+/* ----------------------------------------------------------------------
+   setup dynamic load balance operations
+   called from command & fix balance
+------------------------------------------------------------------------- */
+
+void Balance::dynamic_setup(char *str)
+{
+  nops = strlen(str);
+  ops = new int[nops];
+
+  for (int i = 0; i < strlen(str); i++) {
+    if (str[i] == 'x') ops[i] = X;
+    if (str[i] == 'y') ops[i] = Y;
+    if (str[i] == 'z') ops[i] = Z;
+  }
+  
+  splits[0] = comm->xsplit;
+  splits[1] = comm->ysplit;
+  splits[2] = comm->zsplit;
+
+  counts[0] = new bigint[comm->procgrid[0]];
+  counts[1] = new bigint[comm->procgrid[1]];
+  counts[2] = new bigint[comm->procgrid[2]];
+
+  int max = MAX(comm->procgrid[0],comm->procgrid[1]);
+  max = MAX(max,comm->procgrid[2]);
+  cuts = new double[max+1];
+  onecount = new bigint[max];
+
+  //MPI_Comm_split(world,comm->myloc[0],0,&commslice[0]);
+  //MPI_Comm_split(world,comm->myloc[1],0,&commslice[1]);
+  //MPI_Comm_split(world,comm->myloc[2],0,&commslice[2]);
+}
+
+/* ----------------------------------------------------------------------
+   perform dynamic load balance by changing xyz split proc boundaries in Comm
+   called from command and fix balance
+------------------------------------------------------------------------- */
+
+void Balance::dynamic()
+{
+  int i,m,max;
+  double imbfactor;
+
+  int *procgrid = comm->procgrid;
+
+  domain->x2lamda(atom->nlocal);
+
+  count = 0;
+  for (int irepeat = 0; irepeat < nrepeat; irepeat++) {
+    for (i = 0; i < nops; i++) {
+      for (m = 0; m < niter; m++) {
+	stats(ops[i],procgrid[ops[i]],splits[ops[i]],counts[ops[i]]);
+	adjust(procgrid[ops[i]],counts[ops[i]],splits[ops[i]]);
+	count++;
+	// debug output of intermediate result
+	//dumpout(-1);
+      }
+      imbfactor = imbalance_splits(max);
+      if (imbfactor <= thresh) break;
+    }
+    if (i < nops) break;
+  }
+
+  domain->lamda2x(atom->nlocal);
+}
+
+/* ----------------------------------------------------------------------
+   count atoms in each slice
+   current cuts may be very different than original cuts,
+   so use binary search to find which slice each atom is in
+------------------------------------------------------------------------- */
+
+void Balance::stats(int dim, int n, double *split, bigint *count)
+{
+  for (int i = 0; i < n; i++) onecount[i] = 0;
+
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+  int index;
+
+  for (int i = 0; i < nlocal; i++) {
+    index = binary(x[i][dim],n,split);
+    onecount[index]++;
+  }
+
+  MPI_Allreduce(onecount,count,n,MPI_LMP_BIGINT,MPI_SUM,world);
+}
+
+/* ----------------------------------------------------------------------
+   adjust cuts between N slices in a dim via diffusive method
+   count = atoms per slice
+   split = current N+1 cuts, with 0.0 and 1.0 at end points
+   overwrite split with new cuts
+   diffusion means slices with more atoms than their neighbors
+     "send" them atoms by moving cut closer to sender, further from receiver
+------------------------------------------------------------------------- */
+
+void Balance::adjust(int n, bigint *count, double *split)
+{
+  // damping factor
+
+  double damp = 0.5;
+
+  // loop over slices from 1 to N-2 inclusive (not end slices 0 and N-1)
+  // cut I is between 2 slices (I-1 and I) with counts
+  // cut I+1 is between 2 slices (I and I+1) with counts
+  // for a cut between 2 slices, only slice with larger count adjusts it
+
+  bigint leftcount,mycount,rightcount;
+  double rho,target,targetleft,targetright;
+
+  for (int i = 1; i < n-1; i++) {
+    leftcount = count[i-1];
+    mycount = count[i];
+    rightcount = count[i+1];
+
+    // middle slice is <= both left and right, so do nothing
+    // special case if 2 slices both have count = 0, no change in cut
+
+    if (mycount <= leftcount && mycount <= rightcount) {
+      if (leftcount == 0) cuts[i] = split[i];
+      if (rightcount == 0) cuts[i+1] = split[i+1];
+      continue;
+    }
+
+    // rho = density of atoms in the slice
+
+    rho = mycount / (split[i+1] - split[i]);
+
+    // middle slice has more atoms then left or right slice
+    // if delta from middle to top slice > delta between top and bottom slice
+    //   then send atoms both dirs to bring all 3 slices to same count
+    // else bottom slice is very low, so send atoms only in that dir
+
+    if (mycount > leftcount && mycount > rightcount) {
+      if (mycount-MAX(leftcount,rightcount) >= fabs(leftcount-rightcount)) {
+	if (leftcount <= rightcount) {
+	  targetleft = damp * 
+	    (rightcount-leftcount + (mycount-rightcount)/3.0);
+	  targetright = damp * (mycount-rightcount)/3.0;
+	  cuts[i] = split[i] + targetleft/rho;
+	  cuts[i+1] = split[i+1] - targetright/rho;
+	} else {
+	  targetleft = damp * (mycount-leftcount)/3.0;
+	  targetright = damp * 
+	    (leftcount-rightcount + (mycount-leftcount)/3.0);
+	  cuts[i] = split[i] + targetleft/rho;
+	  cuts[i+1] = split[i+1] - targetright/rho;
+	}
+      } else if (leftcount < rightcount) {
+	target = damp * 0.5*(mycount-leftcount);
+	cuts[i] = split[i] + target/rho;
+	cuts[i+1] = split[i+1];
+      } else if (rightcount < leftcount) {
+	target = damp * 0.5*(mycount-rightcount);
+	cuts[i+1] = split[i+1] - target/rho;
+	cuts[i] = split[i];
+      }
+
+    // middle slice has more atoms than only left or right slice
+    // send atoms only in that dir
+
+    } else if (mycount > leftcount) {
+      target = damp * 0.5*(mycount-leftcount);
+      cuts[i] = split[i] + target/rho;
+    } else if (mycount > rightcount) {
+      target = damp * 0.5*(mycount-rightcount);
+      cuts[i+1] = split[i+1] - target/rho;
+    }
+  }
+
+  // overwrite adjustable splits with new cuts
+
+  for (int i = 1; i < n; i++) split[i] = cuts[i];
+}
+
+/* ----------------------------------------------------------------------
+   binary search for value in N-length ascending vec
+   value may be outside range of vec limits
+   always return index from 0 to N-1 inclusive
+   return 0 if value < vec[0]
+   reutrn N-1 if value >= vec[N-1]
+   return index = 1 to N-2 if vec[index] <= value < vec[index+1]
+------------------------------------------------------------------------- */
+
+int Balance::binary(double value, int n, double *vec)
+{
+  int lo = 0;
+  int hi = n-1;
+
+  if (value < vec[lo]) return lo;
+  if (value >= vec[hi]) return hi;
+
+  // insure vec[lo] <= value < vec[hi] at every iteration
+  // done when lo,hi are adjacent
+
+  int index = (lo+hi)/2;
+  while (lo < hi-1) {
+    if (value < vec[index]) hi = index;
+    else if (value >= vec[index]) lo = index;
+    index = (lo+hi)/2;
+  }
+
+  return index;
+}
+
+/* ----------------------------------------------------------------------
+   create dump file of line segments in Pizza.py mdump mesh format
+   just for debug/viz purposes
+   write to tmp.mdump.*, open first time if necessary
+   write xy lines around each proc's sub-domain for 2d
+   write xyz cubes around each proc's sub-domain for 3d
+   all procs but 0 just return
+------------------------------------------------------------------------- */
+
+void Balance::dumpout(bigint tstamp)
+{
+  if (me) return;
+
+  bigint tstep;
+  if (tstamp >= 0) tstep = tstamp;
+  else tstep = laststep + 1;
+  laststep = tstep;
+
+  if (fp == NULL) {
+    char file[32];
+    sprintf(file,"tmp.mdump.%ld",tstep);
+    fp = fopen(file,"w");
+    if (!fp) error->one(FLERR,"Cannot open balance output file");
+
+    // write out one square/cute per processor for 2d/3d
+    // only write once since topology is static
+
+    fprintf(fp,"ITEM: TIMESTEP\n");
+    fprintf(fp,"%ld\n",tstep);
+    fprintf(fp,"ITEM: NUMBER OF SQUARES\n");
+    fprintf(fp,"%d\n",nprocs);
+    fprintf(fp,"ITEM: SQUARES\n");
+
+    int nx = comm->procgrid[0] + 1;
+    int ny = comm->procgrid[1] + 1;
+    int nz = comm->procgrid[2] + 1;
+
+    if (domain->dimension == 2) {
+      int m = 0;
+      for (int j = 0; j < comm->procgrid[1]; j++)
+	for (int i = 0; i < comm->procgrid[0]; i++) {
+	  int c1 = j*nx + i + 1;
+	  int c2 = c1 + 1;
+	  int c3 = c2 + nx;
+	  int c4 = c3 - 1;
+	  fprintf(fp,"%d %d %d %d %d %d\n",m+1,m+1,c1,c2,c3,c4);
+	  m++;
+	}
+
+    } else {
+      int m = 0;
+      for (int k = 0; k < comm->procgrid[2]; k++)
+	for (int j = 0; j < comm->procgrid[1]; j++)
+	  for (int i = 0; i < comm->procgrid[0]; i++) {
+	    int c1 = k*ny*nx + j*nx + i + 1;
+	    int c2 = c1 + 1;
+	    int c3 = c2 + nx;
+	    int c4 = c3 - 1;
+	    int c5 = c1 + ny*nx;
+	    int c6 = c2 + ny*nx;
+	    int c7 = c3 + ny*nx;
+	    int c8 = c4 + ny*nx;
+	    fprintf(fp,"%d %d %d %d %d %d %d %d %d %d\n",
+		    m+1,m+1,c1,c2,c3,c4,c5,c6,c7,c8);
+	    m++;
+	  }
+    }
+  }
+
+  // write out nodal coords, can be different every call
+  // scale xsplit,ysplit,zsplit values to full box
+  // only implmented for orthogonal boxes, not triclinic
+
+  int nx = comm->procgrid[0] + 1;
+  int ny = comm->procgrid[1] + 1;
+  int nz = comm->procgrid[2] + 1;
+
+  double *boxlo = domain->boxlo;
+  double *boxhi = domain->boxhi;
+  double *prd = domain->prd;
+
+  fprintf(fp,"ITEM: TIMESTEP\n");
+  fprintf(fp,"%ld\n",tstep);
+  fprintf(fp,"ITEM: NUMBER OF NODES\n");
+  if (domain->dimension == 2) 
+    fprintf(fp,"%d\n",nx*ny);
+  else 
+    fprintf(fp,"%d\n",nx*ny*nz);
+  fprintf(fp,"ITEM: BOX BOUNDS\n");
+  fprintf(fp,"%g %g\n",boxlo[0],boxhi[0]);
+  fprintf(fp,"%g %g\n",boxlo[1],boxhi[1]);
+  fprintf(fp,"%g %g\n",boxlo[2],boxhi[2]);
+  fprintf(fp,"ITEM: NODES\n");
+
+  if (domain->dimension == 2) {
+    int m = 0;
+    for (int j = 0; j < ny; j++)
+      for (int i = 0; i < nx; i++) {
+	fprintf(fp,"%d %d %g %g %g\n",m+1,1,
+		boxlo[0] + prd[0]*comm->xsplit[i],
+		boxlo[1] + prd[1]*comm->ysplit[j],
+		0.0);
+	m++;
+      }
+  } else {
+    int m = 0;
+    for (int k = 0; k < nz; k++)
+      for (int j = 0; j < ny; j++)
+	for (int i = 0; i < nx; i++) {
+	  fprintf(fp,"%d %d %g %g %g\n",m+1,1,
+		  boxlo[0] + prd[0]*comm->xsplit[i],
+		  boxlo[1] + prd[1]*comm->ysplit[j],
+		  boxlo[2] + prd[2]*comm->zsplit[j]);
+	  m++;
+      }
+  }
+}
--- a/src/balance.h
+++ b/src/balance.h
@ -0,0 +1,71 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMMAND_CLASS
+
+CommandStyle(balance,Balance)
+
+#else
+
+#ifndef LMP_BALANCE_H
+#define LMP_BALANCE_H
+
+#include "mpi.h"
+#include "stdio.h"
+#include "pointers.h"
+
+namespace LAMMPS_NS {
+
+class Balance : protected Pointers {
+ public:
+  Balance(class LAMMPS *);
+  ~Balance();
+  void command(int, char **);
+
+ private:
+  int me,nprocs;
+  int xflag,yflag,zflag,dflag;
+  int nrepeat,niter;
+  double thresh;
+  char *bstr;
+  double *user_xsplit,*user_ysplit,*user_zsplit;
+
+  int *ops;
+  int nops;
+  double *splits[3];
+  bigint *counts[3];
+  double *cuts;
+  bigint *onecount;
+  MPI_Comm commslice[3];
+
+  int count;
+  int *pcount,*allcount;
+
+  FILE *fp;                      // for debug output
+  bigint laststep;
+
+  double imbalance_splits(int &);
+  double imbalance_nlocal(int &);
+  void dynamic_setup(char *);
+  void dynamic();
+  void stats(int, int, double *, bigint *);
+  void adjust(int, bigint *, double *);
+  int binary(double, int, double *);
+
+  void dumpout(bigint);          // for debug output
+};
+
+}
+
+#endif
+#endif
--- a/src/change_box.cpp
+++ b/src/change_box.cpp
@ -150,13 +150,13 @@ void ChangeBox::command(int narg, char **arg)
      iarg += 4;

    } else if (strcmp(arg[iarg],"ortho") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal change_box command");
+      if (iarg+1 > narg) error->all(FLERR,"Illegal change_box command");
      ops[nops].style = ORTHO;
      nops++;
      iarg += 1;

    } else if (strcmp(arg[iarg],"triclinic") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal change_box command");
+      if (iarg+1 > narg) error->all(FLERR,"Illegal change_box command");
      ops[nops].style = TRICLINIC;
      nops++;
      iarg += 1;
--- a/src/comm.cpp
+++ b/src/comm.cpp
@ -78,6 +78,8 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)

  bordergroup = 0;
  style = SINGLE;
+  uniform = 1;
+  xsplit = ysplit = zsplit = NULL;
  multilo = multihi = NULL;
  cutghostmulti = NULL;
  cutghostuser = 0.0;
@ -124,6 +126,10 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)

 Comm::~Comm()
 {
+  memory->destroy(xsplit);
+  memory->destroy(ysplit);
+  memory->destroy(zsplit);
+
  delete [] customfile;
  delete [] outfile;

@ -145,7 +151,7 @@ Comm::~Comm()

 /* ----------------------------------------------------------------------
   create 3d grid of procs based on Nprocs and box size & shape
-   map processors to grid
+   map processors to grid, setup xyz split for a uniform grid
 ------------------------------------------------------------------------- */

 void Comm::set_proc_grid()
@ -252,14 +258,31 @@ void Comm::set_proc_grid()

  if (outfile) pmap->output(outfile,procgrid,grid2proc);

-  // set lamda box params after procs are assigned
-
-  if (domain->triclinic) domain->set_lamda_box();
-
  // free ProcMap class

  delete pmap;

+  // set xsplit,ysplit,zsplit for uniform spacings
+
+  memory->destroy(xsplit);
+  memory->destroy(ysplit);
+  memory->destroy(zsplit);
+
+  memory->create(xsplit,procgrid[0]+1,"comm:xsplit");
+  memory->create(ysplit,procgrid[1]+1,"comm:ysplit");
+  memory->create(zsplit,procgrid[2]+1,"comm:zsplit");
+
+  for (int i = 0; i < procgrid[0]; i++) xsplit[i] = i * 1.0/procgrid[0];
+  for (int i = 0; i < procgrid[1]; i++) ysplit[i] = i * 1.0/procgrid[1];
+  for (int i = 0; i < procgrid[2]; i++) zsplit[i] = i * 1.0/procgrid[2];
+
+  xsplit[procgrid[0]] = ysplit[procgrid[1]] = zsplit[procgrid[2]] = 1.0;
+
+  // set lamda box params after procs are assigned
+  // only set once unless load-balancing occurs
+
+  if (domain->triclinic) domain->set_lamda_box();
+
  // send my 3d proc grid to another partition if requested

  if (send_to_partition >= 0) {
@ -399,25 +422,112 @@ void Comm::setup()
    }
  }

-  // need = # of procs I need atoms from in each dim based on max cutoff
-  // for 2d, don't communicate in z
-
-  need[0] = static_cast<int> (cutghost[0] * procgrid[0] / prd[0]) + 1;
-  need[1] = static_cast<int> (cutghost[1] * procgrid[1] / prd[1]) + 1;
-  need[2] = static_cast<int> (cutghost[2] * procgrid[2] / prd[2]) + 1;
-  if (domain->dimension == 2) need[2] = 0;
-
-  // if non-periodic, do not communicate further than procgrid-1 away
-  // this enables very large cutoffs in non-periodic systems
+  // recvneed[idim][0/1] = # of procs away I recv atoms from, within cutghost
+  //   0 = from left, 1 = from right
+  //   do not cross non-periodic boundaries, need[2] = 0 for 2d
+  // sendneed[idim][0/1] = # of procs away I send atoms to
+  //   0 = to left, 1 = to right
+  //   set equal to recvneed[idim][1/0] of neighbor proc
+  // maxneed[idim] = max procs away any proc recvs atoms in either direction
+  // uniform = 1 = uniform sized sub-domains:
+  //   maxneed is directly computable from sub-domain size
+  //     limit to procgrid-1 for non-PBC
+  //   recvneed = maxneed except for procs near non-PBC
+  //   sendneed = recvneed of neighbor on each side
+  // uniform = 0 = non-uniform sized sub-domains:
+  //   compute recvneed via updown() which accounts for non-PBC
+  //   sendneed = recvneed of neighbor on each side
+  //   maxneed via Allreduce() of recvneed

  int *periodicity = domain->periodicity;
-  if (periodicity[0] == 0) need[0] = MIN(need[0],procgrid[0]-1);
-  if (periodicity[1] == 0) need[1] = MIN(need[1],procgrid[1]-1);
-  if (periodicity[2] == 0) need[2] = MIN(need[2],procgrid[2]-1);
+  int left,right;
+
+  if (uniform) {
+    maxneed[0] = static_cast<int> (cutghost[0] * procgrid[0] / prd[0]) + 1;
+    maxneed[1] = static_cast<int> (cutghost[1] * procgrid[1] / prd[1]) + 1;
+    maxneed[2] = static_cast<int> (cutghost[2] * procgrid[2] / prd[2]) + 1;
+    if (domain->dimension == 2) maxneed[2] = 0;
+    if (!periodicity[0]) maxneed[0] = MIN(maxneed[0],procgrid[0]-1);
+    if (!periodicity[1]) maxneed[1] = MIN(maxneed[1],procgrid[1]-1);
+    if (!periodicity[2]) maxneed[2] = MIN(maxneed[2],procgrid[2]-1);
+
+    if (!periodicity[0]) {
+      recvneed[0][0] = MIN(maxneed[0],myloc[0]);
+      recvneed[0][1] = MIN(maxneed[0],procgrid[0]-myloc[0]-1);
+      left = myloc[0] - 1;
+      if (left < 0) left = procgrid[0] - 1;
+      sendneed[0][0] = MIN(maxneed[0],procgrid[0]-left-1);
+      right = myloc[0] + 1;
+      if (right == procgrid[0]) right = 0;
+      sendneed[0][1] = MIN(maxneed[0],right);
+    } else recvneed[0][0] = recvneed[0][1] = 
+	     sendneed[0][0] = sendneed[0][1] = maxneed[0];
+
+    if (!periodicity[1]) {
+      recvneed[1][0] = MIN(maxneed[1],myloc[1]);
+      recvneed[1][1] = MIN(maxneed[1],procgrid[1]-myloc[1]-1);
+      left = myloc[1] - 1;
+      if (left < 0) left = procgrid[1] - 1;
+      sendneed[1][0] = MIN(maxneed[1],procgrid[1]-left-1);
+      right = myloc[1] + 1;
+      if (right == procgrid[1]) right = 0;
+      sendneed[1][1] = MIN(maxneed[1],right);
+    } else recvneed[1][0] = recvneed[1][1] = 
+	     sendneed[1][0] = sendneed[1][1] = maxneed[1];
+
+    if (!periodicity[2]) {
+      recvneed[2][0] = MIN(maxneed[2],myloc[2]);
+      recvneed[2][1] = MIN(maxneed[2],procgrid[2]-myloc[2]-1);
+      left = myloc[2] - 1;
+      if (left < 0) left = procgrid[2] - 1;
+      sendneed[2][0] = MIN(maxneed[2],procgrid[2]-left-1);
+      right = myloc[2] + 1;
+      if (right == procgrid[2]) right = 0;
+      sendneed[2][1] = MIN(maxneed[2],right);
+    } else recvneed[2][0] = recvneed[2][1] = 
+	     sendneed[2][0] = sendneed[2][1] = maxneed[2];
+
+  } else {
+    recvneed[0][0] = updown(0,0,myloc[0],prd[0],periodicity[0],xsplit);
+    recvneed[0][1] = updown(0,1,myloc[0],prd[0],periodicity[0],xsplit);
+    left = myloc[0] - 1;
+    if (left < 0) left = procgrid[0] - 1;
+    sendneed[0][0] = updown(0,1,left,prd[0],periodicity[0],xsplit);
+    right = myloc[0] + 1;
+    if (right == procgrid[0]) right = 0;
+    sendneed[0][1] = updown(0,0,right,prd[0],periodicity[0],xsplit);
+
+    recvneed[1][0] = updown(1,0,myloc[1],prd[1],periodicity[1],ysplit);
+    recvneed[1][1] = updown(1,1,myloc[1],prd[1],periodicity[1],ysplit);
+    left = myloc[1] - 1;
+    if (left < 0) left = procgrid[1] - 1;
+    sendneed[1][0] = updown(1,1,left,prd[1],periodicity[1],ysplit);
+    right = myloc[1] + 1;
+    if (right == procgrid[1]) right = 0;
+    sendneed[1][1] = updown(1,0,right,prd[1],periodicity[1],ysplit);
+
+    if (domain->dimension == 3) { 
+      recvneed[2][0] = updown(2,0,myloc[2],prd[2],periodicity[2],zsplit);
+      recvneed[2][1] = updown(2,1,myloc[2],prd[2],periodicity[2],zsplit);
+      left = myloc[2] - 1;
+      if (left < 0) left = procgrid[2] - 1;
+      sendneed[2][0] = updown(2,1,left,prd[2],periodicity[2],zsplit);
+      right = myloc[2] + 1;
+      if (right == procgrid[2]) right = 0;
+      sendneed[2][1] = updown(2,0,right,prd[2],periodicity[2],zsplit);
+    } else recvneed[2][0] = recvneed[2][1] = 
+	     sendneed[2][0] = sendneed[2][1] = 0;
+
+    int all[6];
+    MPI_Allreduce(&recvneed[0][0],all,6,MPI_INT,MPI_MAX,world);
+    maxneed[0] = MAX(all[0],all[1]);
+    maxneed[1] = MAX(all[2],all[3]);
+    maxneed[2] = MAX(all[4],all[5]);
+  }

  // allocate comm memory

-  nswap = 2 * (need[0]+need[1]+need[2]);
+  nswap = 2 * (maxneed[0]+maxneed[1]+maxneed[2]);
  if (nswap > maxswap) grow_swap(nswap);

  // setup parameters for each exchange:
@ -428,13 +538,11 @@ void Comm::setup()
  //   use -BIG/midpt/BIG to insure all atoms included even if round-off occurs
  //   if round-off, atoms recvd across PBC can be < or > than subbox boundary
  //   note that borders() only loops over subset of atoms during each swap
-  //   set slablo > slabhi for swaps across non-periodic boundaries
-  //     this insures no atoms are swapped
-  //     only for procs owning sub-box at non-periodic end of global box
+  //   treat all as PBC here, non-PBC is handled in borders() via r/s need[][]
  // for style MULTI:
-  //   multilo/multihi is same as slablo/slabhi, only for each atom type
+  //   multilo/multihi is same, with slablo/slabhi for each atom type
  // pbc_flag: 0 = nothing across a boundary, 1 = something across a boundary
-  // pbc = -1/0/1 for PBC factor in each of 3/6 orthog/triclinic dirs
+  // pbc = -1/0/1 for PBC factor in each of 3/6 orthogonal/triclinic dirs
  // for triclinic, slablo/hi and pbc_border will be used in lamda (0-1) coords
  // 1st part of if statement is sending to the west/south/down
  // 2nd part of if statement is sending to the east/north/up
@ -443,7 +551,7 @@ void Comm::setup()

  int iswap = 0;
  for (dim = 0; dim < 3; dim++) {
-    for (ineed = 0; ineed < 2*need[dim]; ineed++) {
+    for (ineed = 0; ineed < 2*maxneed[dim]; ineed++) {
      pbc_flag[iswap] = 0;
      pbc[iswap][0] = pbc[iswap][1] = pbc[iswap][2] =
 	pbc[iswap][3] = pbc[iswap][4] = pbc[iswap][5] = 0;
@ -463,12 +571,6 @@ void Comm::setup()
 	  }
 	}
 	if (myloc[dim] == 0) {
-	  if (periodicity[dim] == 0) {
-	    if (style == SINGLE) slabhi[iswap] = slablo[iswap] - 1.0;
-	    else 
-	      for (i = 1; i <= ntypes; i++)
-		multihi[iswap][i] = multilo[iswap][i] - 1.0;
-	  } else {
 	  pbc_flag[iswap] = 1;
 	  pbc[iswap][dim] = 1;
 	  if (triclinic) {
@ -476,7 +578,6 @@ void Comm::setup()
 	    else if (dim == 2) pbc[iswap][4] = pbc[iswap][3] = 1;
 	  }
 	}
-	}
 	
      } else {
 	sendproc[iswap] = procneigh[dim][1];
@ -493,12 +594,6 @@ void Comm::setup()
 	  }
 	}
 	if (myloc[dim] == procgrid[dim]-1) {
-	  if (periodicity[dim] == 0) {
-	    if (style == SINGLE) slabhi[iswap] = slablo[iswap] - 1.0;
-	    else
-	      for (i = 1; i <= ntypes; i++)
-		multihi[iswap][i] = multilo[iswap][i] - 1.0;
-	  } else {
 	  pbc_flag[iswap] = 1;
 	  pbc[iswap][dim] = -1;
 	  if (triclinic) {
@ -507,13 +602,61 @@ void Comm::setup()
 	  }
 	}
      }
-      }
      
      iswap++;
    }
  }
 }

+/* ----------------------------------------------------------------------
+   walk up/down the extent of nearby processors in dim and dir
+   loc = myloc of proc to start at
+   dir = 0/1 = walk to left/right
+   do not cross non-periodic boundaries
+   is not called for z dim in 2d
+   return how many procs away are needed to encompass cutghost away from loc
+------------------------------------------------------------------------- */
+
+int Comm::updown(int dim, int dir, int loc, 
+		 double prd, int periodicity, double *split)
+{
+  int index,count;
+  double frac,delta;
+
+  if (dir == 0) {
+    frac = cutghost[dim]/prd;
+    index = loc - 1;
+    delta = 0.0;
+    count = 0;
+    while (delta < frac) {
+      if (index < 0) {
+	if (!periodicity) break;
+	index = procgrid[dim] - 1;
+      }
+      count++;
+      delta += split[index+1] - split[index];
+      index--;
+    }
+
+  } else {
+    frac = cutghost[dim]/prd;
+    index = loc + 1;
+    delta = 0.0;
+    count = 0;
+    while (delta < frac) {
+      if (index >= procgrid[dim]) {
+	if (!periodicity) break;
+	index = 0;
+      }
+      count++;
+      delta += split[index+1] - split[index];
+      index++;
+    }
+  }
+
+  return count;
+}  
+
 /* ----------------------------------------------------------------------
   forward communication of atom coords every timestep
   other per-atom attributes may also be sent via pack/unpack routines
@ -537,27 +680,30 @@ void Comm::forward_comm(int dummy)
      if (comm_x_only) {
 	if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
 	else buf = NULL;
+	if (size_forward_recv[iswap])
 	  MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
 		    recvproc[iswap],0,world,&request);
 	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
 			    buf_send,pbc_flag[iswap],pbc[iswap]);
-	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-	MPI_Wait(&request,&status);
+	if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	if (size_forward_recv[iswap]) MPI_Wait(&request,&status);
      } else if (ghost_velocity) {
+	if (size_forward_recv[iswap])
 	  MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
 		    recvproc[iswap],0,world,&request);
 	n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
 				buf_send,pbc_flag[iswap],pbc[iswap]);
-	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-	MPI_Wait(&request,&status);
+	if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	if (size_forward_recv[iswap]) MPI_Wait(&request,&status);
 	avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv);
      } else {
+	if (size_forward_recv[iswap])
 	  MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
 		    recvproc[iswap],0,world,&request);
 	n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
 			    buf_send,pbc_flag[iswap],pbc[iswap]);
-	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-	MPI_Wait(&request,&status);
+	if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	if (size_forward_recv[iswap]) MPI_Wait(&request,&status);
 	avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
      }

@ -601,19 +747,22 @@ void Comm::reverse_comm()
  for (int iswap = nswap-1; iswap >= 0; iswap--) {
    if (sendproc[iswap] != me) {
      if (comm_f_only) {
+	if (size_reverse_recv[iswap])
 	  MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE,
 		    sendproc[iswap],0,world,&request);
 	if (size_reverse_send[iswap]) buf = f[firstrecv[iswap]];
 	else buf = NULL;
+	if (size_reverse_send[iswap])
 	  MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
 		   recvproc[iswap],0,world);
-	MPI_Wait(&request,&status);
+	if (size_reverse_recv[iswap]) MPI_Wait(&request,&status);
      } else {
+	if (size_reverse_recv[iswap])
 	  MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE,
 		    sendproc[iswap],0,world,&request);
 	n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
-	MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world);
-	MPI_Wait(&request,&status);
+	if (n) MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world);
+	if (size_reverse_recv[iswap]) MPI_Wait(&request,&status);
      }
      avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_recv);

@ -752,8 +901,8 @@ void Comm::exchange()

 void Comm::borders()
 {
-  int i,n,itype,iswap,dim,ineed,maxneed,smax,rmax;
-  int nsend,nrecv,nfirst,nlast,ngroup;
+  int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax;
+  int nsend,nrecv,sendflag,nfirst,nlast,ngroup;
  double lo,hi;
  int *type;
  double **x;
@ -774,8 +923,8 @@ void Comm::borders()

  for (dim = 0; dim < 3; dim++) {
    nlast = 0;
-    maxneed = 2*need[dim];
-    for (ineed = 0; ineed < maxneed; ineed++) {
+    twoneed = 2*maxneed[dim];
+    for (ineed = 0; ineed < twoneed; ineed++) {

      // find atoms within slab boundaries lo/hi using <= and >=
      // check atoms between nfirst and nlast
@ -799,11 +948,19 @@ void Comm::borders()

      nsend = 0;

+      // sendflag = 0 if I do not send on this swap
+      // sendneed test indicates receiver no longer requires data
+      // e.g. due to non-PBC or non-uniform sub-domains
+
+      if (ineed/2 >= sendneed[dim][ineed % 2]) sendflag = 0;
+      else sendflag = 1;
+
      // find send atoms according to SINGLE vs MULTI
      // all atoms eligible versus atoms in bordergroup
      // only need to limit loop to bordergroup for first sends (ineed < 2)
      // on these sends, break loop in two: owned (in group) and ghost

+      if (sendflag) {
 	if (!bordergroup || ineed >= 2) {
 	  if (style == SINGLE) {
 	    for (i = nfirst; i < nlast; i++)
@ -852,6 +1009,7 @@ void Comm::borders()
 	    }
 	  }
 	}
+      }

      // pack up list of border atoms

@ -865,18 +1023,18 @@ void Comm::borders()
 			      pbc_flag[iswap],pbc[iswap]);
      
      // swap atoms with other proc
+      // no MPI calls except SendRecv if nsend/nrecv = 0
      // put incoming ghosts at end of my atom arrays
      // if swapping with self, simply copy, no messages

      if (sendproc[iswap] != me) {
 	MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
 		     &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
-	if (nrecv*size_border > maxrecv) 
-	  grow_recv(nrecv*size_border);
-	MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
+	if (nrecv*size_border > maxrecv) grow_recv(nrecv*size_border);
+	if (nrecv) MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
 			     recvproc[iswap],0,world,&request);
-	MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-	MPI_Wait(&request,&status);
+	if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+	if (nrecv) MPI_Wait(&request,&status);
 	buf = buf_recv;
      } else {
 	nrecv = nsend;
@ -939,10 +1097,12 @@ void Comm::forward_comm_pair(Pair *pair)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
+      if (recvnum[iswap])
 	MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
 		  world,&request);
+      if (sendnum[iswap])
 	MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      if (recvnum[iswap]) MPI_Wait(&request,&status);
      buf = buf_recv;
    } else buf = buf_send;

@ -973,10 +1133,12 @@ void Comm::reverse_comm_pair(Pair *pair)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
+      if (sendnum[iswap]) 
 	MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,
 		  world,&request);
+      if (recvnum[iswap])
 	MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      if (sendnum[iswap]) MPI_Wait(&request,&status);
      buf = buf_recv;
    } else buf = buf_send;

@ -1008,10 +1170,12 @@ void Comm::forward_comm_fix(Fix *fix)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
+      if (recvnum[iswap]) 
 	MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
 		  world,&request);
+      if (sendnum[iswap]) 
 	MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      if (recvnum[iswap]) MPI_Wait(&request,&status);
      buf = buf_recv;
    } else buf = buf_send;

@ -1042,10 +1206,12 @@ void Comm::reverse_comm_fix(Fix *fix)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
+      if (sendnum[iswap]) 
 	MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,
 		  world,&request);
+      if (recvnum[iswap]) 
 	MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      if (sendnum[iswap]) MPI_Wait(&request,&status);
      buf = buf_recv;
    } else buf = buf_send;

@ -1077,10 +1243,12 @@ void Comm::forward_comm_compute(Compute *compute)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
+      if (recvnum[iswap]) 
 	MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
 		  world,&request);
+      if (sendnum[iswap]) 
 	MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      if (recvnum[iswap]) MPI_Wait(&request,&status);
      buf = buf_recv;
    } else buf = buf_send;

@ -1111,10 +1279,12 @@ void Comm::reverse_comm_compute(Compute *compute)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
+      if (sendnum[iswap]) 
 	MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,
 		  world,&request);
+      if (recvnum[iswap]) 
 	MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      if (sendnum[iswap]) MPI_Wait(&request,&status);
      buf = buf_recv;
    } else buf = buf_send;

@ -1146,10 +1316,12 @@ void Comm::forward_comm_dump(Dump *dump)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
+      if (recvnum[iswap]) 
 	MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
 		  world,&request);
+      if (sendnum[iswap]) 
 	MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      if (recvnum[iswap]) MPI_Wait(&request,&status);
      buf = buf_recv;
    } else buf = buf_send;

@ -1180,10 +1352,12 @@ void Comm::reverse_comm_dump(Dump *dump)
    // if self, set recv buffer to send buffer

    if (sendproc[iswap] != me) {
+      if (sendnum[iswap]) 
 	MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,
 		  world,&request);
+      if (recvnum[iswap]) 
 	MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world);
-      MPI_Wait(&request,&status);
+      if (sendnum[iswap]) MPI_Wait(&request,&status);
      buf = buf_recv;
    } else buf = buf_send;

@ -1373,6 +1547,10 @@ void Comm::set_processors(int narg, char **arg)
  if (user_procgrid[0] < 0 || user_procgrid[1] < 0 || user_procgrid[2] < 0) 
    error->all(FLERR,"Illegal processors command");

+  int p = user_procgrid[0]*user_procgrid[1]*user_procgrid[2];
+  if (p && p != nprocs)
+    error->all(FLERR,"Specified processors != physical processors");
+
  int iarg = 3;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"grid") == 0) {
--- a/src/comm.h
+++ b/src/comm.h
@ -24,8 +24,10 @@ class Comm : protected Pointers {
  int procgrid[3];                  // procs assigned in each dim of 3d grid
  int user_procgrid[3];             // user request for procs in each dim
  int myloc[3];                     // which proc I am in each dim
-  int procneigh[3][2];              // my 6 neighboring procs
+  int procneigh[3][2];              // my 6 neighboring procs, 0/1 = left/right
  int ghost_velocity;               // 1 if ghost atoms have velocity, 0 if not
+  int uniform;                      // 1 = equal subdomains, 0 = load-balanced
+  double *xsplit,*ysplit,*zsplit;   // fractional (0-1) sub-domain sizes
  double cutghost[3];               // cutoffs used for acquiring ghost atoms
  double cutghostuser;              // user-specified ghost cutoff
  int ***grid2proc;                 // which proc owns i,j,k loc in 3d grid
@ -63,8 +65,10 @@ class Comm : protected Pointers {

 protected:
  int style;                        // single vs multi-type comm
-  int nswap;                        // # of swaps to perform
-  int need[3];                      // procs I need atoms from in each dim
+  int nswap;                        // # of swaps to perform = sum of maxneed
+  int recvneed[3][2];               // # of procs away I recv atoms from
+  int sendneed[3][2];               // # of procs away I send atoms to
+  int maxneed[3];                   // max procs away any proc needs, per dim
  int triclinic;                    // 0 if domain is orthog, 1 if triclinic
  int maxswap;                      // max # of swaps memory is allocated for
  int size_forward;                 // # of per-atom datums in forward comm
@ -106,6 +110,8 @@ class Comm : protected Pointers {
  int maxsend,maxrecv;              // current size of send/recv buffer
  int maxforward,maxreverse;        // max # of datums in forward/reverse comm
 
+  int updown(int, int, int, double, int, double *);
+                                            // compare cutoff to procs
  virtual void grow_send(int,int);          // reallocate send buffer
  virtual void grow_recv(int);              // free/allocate recv buffer
  virtual void grow_list(int, int);         // reallocate one sendlist
--- a/src/domain.cpp
+++ b/src/domain.cpp
@ -215,58 +215,57 @@ void Domain::set_global_box()
 }

 /* ----------------------------------------------------------------------
-   set lamda box params, only need be done one time
-   assumes global box is defined and proc assignment has been made by comm
-   for uppermost proc, insure subhi = 1.0 (in case round-off occurs)
+   set lamda box params
+   assumes global box is defined and proc assignment has been made
+   uses comm->xyz_split to define subbox boundaries in consistent manner
 ------------------------------------------------------------------------- */

 void Domain::set_lamda_box()
 {
  int *myloc = comm->myloc;
-  int *procgrid = comm->procgrid;
+  double *xsplit = comm->xsplit;
+  double *ysplit = comm->ysplit;
+  double *zsplit = comm->zsplit;

-  sublo_lamda[0] = 1.0*myloc[0] / procgrid[0];
-  sublo_lamda[1] = 1.0*myloc[1] / procgrid[1];
-  sublo_lamda[2] = 1.0*myloc[2] / procgrid[2];
+  sublo_lamda[0] = xsplit[myloc[0]];
+  subhi_lamda[0] = xsplit[myloc[0]+1];

-  if (myloc[0] < procgrid[0]-1)
-    subhi_lamda[0] = 1.0*(myloc[0]+1) / procgrid[0];
-  else subhi_lamda[0] = 1.0;
+  sublo_lamda[1] = ysplit[myloc[1]];
+  subhi_lamda[1] = ysplit[myloc[1]+1];

-  if (myloc[1] < procgrid[1]-1)
-    subhi_lamda[1] = 1.0*(myloc[1]+1) / procgrid[1];
-  else subhi_lamda[1] = 1.0;
-
-  if (myloc[2] < procgrid[2]-1)
-    subhi_lamda[2] = 1.0*(myloc[2]+1) / procgrid[2];
-  else subhi_lamda[2] = 1.0;
+  sublo_lamda[2] = zsplit[myloc[2]];
+  subhi_lamda[2] = zsplit[myloc[2]+1];
 }

 /* ----------------------------------------------------------------------
-   set local subbox params
+   set local subbox params for orthogonal boxes
   assumes global box is defined and proc assignment has been made
-   for uppermost proc, insure subhi = boxhi (in case round-off occurs)
+   uses comm->xyz_split to define subbox boundaries in consistent manner
+   insure subhi[max] = boxhi
 ------------------------------------------------------------------------- */

 void Domain::set_local_box()
 {
  int *myloc = comm->myloc;
  int *procgrid = comm->procgrid;
+  double *xsplit = comm->xsplit;
+  double *ysplit = comm->ysplit;
+  double *zsplit = comm->zsplit;

  if (triclinic == 0) {
-    sublo[0] = boxlo[0] + myloc[0] * xprd / procgrid[0];
+    sublo[0] = boxlo[0] + xprd*xsplit[myloc[0]];
    if (myloc[0] < procgrid[0]-1)
-      subhi[0] = boxlo[0] + (myloc[0]+1) * xprd / procgrid[0];
+      subhi[0] = boxlo[0] + xprd*xsplit[myloc[0]+1];
    else subhi[0] = boxhi[0];

-    sublo[1] = boxlo[1] + myloc[1] * yprd / procgrid[1];
+    sublo[1] = boxlo[1] + yprd*ysplit[myloc[1]];
    if (myloc[1] < procgrid[1]-1)
-      subhi[1] = boxlo[1] + (myloc[1]+1) * yprd / procgrid[1];
+      subhi[1] = boxlo[1] + yprd*ysplit[myloc[1]+1];
    else subhi[1] = boxhi[1];

-    sublo[2] = boxlo[2] + myloc[2] * zprd / procgrid[2];
+    sublo[2] = boxlo[2] + zprd*zsplit[myloc[2]];
    if (myloc[2] < procgrid[2]-1)
-      subhi[2] = boxlo[2] + (myloc[2]+1) * zprd / procgrid[2];
+      subhi[2] = boxlo[2] + zprd*zsplit[myloc[2]+1];
    else subhi[2] = boxhi[2];
  }
 }
@ -274,7 +273,7 @@ void Domain::set_local_box()
 /* ----------------------------------------------------------------------
   reset global & local boxes due to global box boundary changes
   if shrink-wrapped, determine atom extent and reset boxlo/hi
-   if triclinic, perform any shrink-wrap in lamda space
+   for triclinic, atoms must be in lamda coords (0-1) before reset_box is called
 ------------------------------------------------------------------------- */

 void Domain::reset_box()
--- a/src/irregular.cpp
+++ b/src/irregular.cpp
@ -83,7 +83,8 @@ void Irregular::migrate_atoms()
  atom->nghost = 0;
  atom->avec->clear_bonus();

-  // subbox bounds for orthogonal or triclinic
+  // subbox bounds for orthogonal or triclinic box
+  // other comm/domain data used by coord2proc()

  double *sublo,*subhi;
  if (triclinic == 0) {
@ -94,6 +95,13 @@ void Irregular::migrate_atoms()
    subhi = domain->subhi_lamda;
  }

+  uniform = comm->uniform;
+  xsplit = comm->xsplit;
+  ysplit = comm->ysplit;
+  zsplit = comm->zsplit;
+  boxlo = domain->boxlo;
+  prd = domain->prd;
+
  // loop over atoms, flag any that are not in my sub-box
  // fill buffer with atoms leaving my box, using < and >=
  // assign which proc it belongs to via coord2proc()
@ -603,26 +611,37 @@ void Irregular::destroy_data()
 /* ----------------------------------------------------------------------
   determine which proc owns atom with coord x[3]
   x will be in box (orthogonal) or lamda coords (triclinic)
+   for uniform = 1, directly calculate owning proc
+   for non-uniform, iteratively find owning proc via binary search
 ------------------------------------------------------------------------- */

 int Irregular::coord2proc(double *x)
 {
  int loc[3];
+
+  if (uniform) {
    if (triclinic == 0) {
-    double *boxlo = domain->boxlo;
-    double *boxhi = domain->boxhi;
-    loc[0] = static_cast<int>
-      (procgrid[0] * (x[0]-boxlo[0]) / (boxhi[0]-boxlo[0]));
-    loc[1] = static_cast<int>
-      (procgrid[1] * (x[1]-boxlo[1]) / (boxhi[1]-boxlo[1]));
-    loc[2] = static_cast<int>
-      (procgrid[2] * (x[2]-boxlo[2]) / (boxhi[2]-boxlo[2]));
+      loc[0] = static_cast<int> (procgrid[0] * (x[0]-boxlo[0]) / prd[0]);
+      loc[1] = static_cast<int>	(procgrid[1] * (x[1]-boxlo[1]) / prd[1]);
+      loc[2] = static_cast<int>	(procgrid[2] * (x[2]-boxlo[2]) / prd[2]);
    } else {
      loc[0] = static_cast<int> (procgrid[0] * x[0]);
      loc[1] = static_cast<int> (procgrid[1] * x[1]);
      loc[2] = static_cast<int> (procgrid[2] * x[2]);
    }

+  } else {
+    if (triclinic == 0) {
+      loc[0] = binary((x[0]-boxlo[0])/prd[0],procgrid[0],xsplit);
+      loc[1] = binary((x[1]-boxlo[1])/prd[1],procgrid[1],ysplit);
+      loc[2] = binary((x[2]-boxlo[2])/prd[2],procgrid[2],zsplit);
+    } else {
+      loc[0] = binary(x[0],procgrid[0],xsplit);
+      loc[1] = binary(x[1],procgrid[1],ysplit);
+      loc[2] = binary(x[2],procgrid[2],zsplit);
+    }
+  }
+
  if (loc[0] < 0) loc[0] = 0;
  if (loc[0] >= procgrid[0]) loc[0] = procgrid[0] - 1;
  if (loc[1] < 0) loc[1] = 0;
@ -633,6 +652,36 @@ int Irregular::coord2proc(double *x)
  return grid2proc[loc[0]][loc[1]][loc[2]];
 }

+/* ----------------------------------------------------------------------
+   binary search for value in N-length ascending vec
+   value may be outside range of vec limits
+   always return index from 0 to N-1 inclusive
+   return 0 if value < vec[0]
+   reutrn N-1 if value >= vec[N-1]
+   return index = 1 to N-2 if vec[index] <= value < vec[index+1]
+------------------------------------------------------------------------- */
+
+int Irregular::binary(double value, int n, double *vec)
+{
+  int lo = 0;
+  int hi = n-1;
+
+  if (value < vec[lo]) return lo;
+  if (value >= vec[hi]) return hi;
+
+  // insure vec[lo] <= value < vec[hi] at every iteration
+  // done when lo,hi are adjacent
+
+  int index = (lo+hi)/2;
+  while (lo < hi-1) {
+    if (value < vec[index]) hi = index;
+    else if (value >= vec[index]) lo = index;
+    index = (lo+hi)/2;
+  }
+
+  return index;
+}
+
 /* ----------------------------------------------------------------------
   realloc the size of the send buffer as needed with BUFFACTOR & BUFEXTRA 
   if flag = 1, realloc
--- a/src/irregular.h
+++ b/src/irregular.h
@ -32,8 +32,12 @@ class Irregular : protected Pointers {
  int me,nprocs;
  int triclinic;
  int map_style;
-  int *procgrid;
-  int ***grid2proc;
+  int uniform;
+  double *xsplit,*ysplit,*zsplit;   // ptrs to comm
+  int *procgrid;                    // ptr to comm
+  int ***grid2proc;                 // ptr to comm
+  double *boxlo;                    // ptr to domain 
+  double *prd;                      // ptr to domain

  int maxsend,maxrecv;              // size of buffers in # of doubles
  double *buf_send,*buf_recv;
@ -81,6 +85,7 @@ class Irregular : protected Pointers {
  void exchange_atom(double *, int *, double *);
  void destroy_atom();
  int coord2proc(double *);
+  int binary(double, int, double *);

  void grow_send(int,int);          // reallocate send buffer
  void grow_recv(int);              // free/allocate recv buffer