git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7340 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-12-13 15:57:18 +00:00
parent 744efbc2cd
commit 86347b81e3
13 changed files with 1121 additions and 517 deletions
--- a/src/comm.cpp
+++ b/src/comm.cpp
@ -35,13 +35,11 @@
 #include "compute.h"
 #include "output.h"
 #include "dump.h"
+#include "procmap.h"
 #include "math_extra.h"
 #include "error.h"
 #include "memory.h"

-#include <map>
-#include <string>
-
 #ifdef _OPENMP
 #include "omp.h"
 #endif
@ -54,8 +52,9 @@ using namespace LAMMPS_NS;
 #define BIG 1.0e20

 enum{SINGLE,MULTI};
-enum{NONE,MULTIPLE};
-enum{CART,CARTREORDER,XYZ,XZY,YXZ,YZX,ZXY,ZYX};
+enum{MULTIPLE};                   // same as in ProcMap
+enum{ONELEVEL,NUMA,CUSTOM};
+enum{CART,CARTREORDER,XYZ};

 /* ----------------------------------------------------------------------
   setup MPI and allocate buffer space 
@ -67,9 +66,14 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
  MPI_Comm_size(world,&nprocs);

  user_procgrid[0] = user_procgrid[1] = user_procgrid[2] = 0;
+  gridflag = ONELEVEL;
+  mapflag = CART;
+  customfile = NULL;
+  recv_from_partition = send_to_partition = -1;
+  otherflag = 0;
+  outfile = NULL;
+
  grid2proc = NULL;
-  layoutflag = CART;
-  numa_nodes = 0;

  bordergroup = 0;
  style = SINGLE;
@ -77,8 +81,6 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
  cutghostmulti = NULL;
  cutghostuser = 0.0;
  ghost_velocity = 0;
-  recv_from_partition = send_to_partition = -1;
-  other_partition_style = NONE;

  // use of OpenMP threads
  // query OpenMP for number of threads/process set by user at run-time
@ -121,7 +123,10 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)

 Comm::~Comm()
 {
-  if (grid2proc) memory->destroy(grid2proc);
+  delete [] customfile;
+  delete [] outfile;
+
+  memory->destroy(grid2proc);

  free_swap();
  if (style == MULTI) {
@ -138,12 +143,13 @@ Comm::~Comm()
 }

 /* ----------------------------------------------------------------------
-   create 3d grid of procs based on box size
+   create 3d grid of procs based on Nprocs and box size & shape
+   map processors to grid
 ------------------------------------------------------------------------- */

 void Comm::set_proc_grid()
 {
-  // recv proc layout of another partition if my layout depends on it
+  // recv 3d proc grid of another partition if my 3d grid depends on it

  if (recv_from_partition >= 0) {
    MPI_Status status;
@ -153,124 +159,96 @@ void Comm::set_proc_grid()
    MPI_Bcast(other_procgrid,3,MPI_INT,0,world);
  }

-  // use NUMA routines if numa_nodes > 0
-  // if NUMA routines fail, just continue
+  // create ProcMap class

-  if (numa_nodes) {
-    int flag = numa_set_proc_grid();
-    if (flag) return;
+  ProcMap *pmap = new ProcMap(lmp);
+
+  // create 3d grid of processors, produces procgrid
+  // can fail (on one partition) if constrained by other partition
+  // if numa_grid() fails, try onelevel_grid()
+
+  int flag;
+  if (gridflag == ONELEVEL) {
+    flag = pmap->onelevel_grid(nprocs,user_procgrid,procgrid,
+			       otherflag,other_style,other_procgrid);
+    if (!flag) error->all(FLERR,"Could not create grid of processors");
+
+  } else if (gridflag == NUMA) {
+    flag = pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
+    if (!flag) {
+      coregrid[0] = coregrid[1] = coregrid[2] = 1;
+      flag = pmap->onelevel_grid(nprocs,user_procgrid,procgrid,
+				 otherflag,other_style,other_procgrid);
+    }
+    if (!flag) error->all(FLERR,"Could not create grid of processors");
+
+  } else if (gridflag == CUSTOM) {
+    pmap->custom_grid(nprocs,user_procgrid,procgrid);
  }

-  // create layout of procs mapped to simulation box
-  // can fail (on one partition) if constrained by other_partition_style
-
-  int flag = 
-    procs2box(nprocs,user_procgrid,procgrid,1,1,1,other_partition_style);
-  if (!flag) error->all(FLERR,"Could not layout grid of processors",1);
-
-  // error check
+  // error check on procgrid

  if (procgrid[0]*procgrid[1]*procgrid[2] != nprocs)
    error->all(FLERR,"Bad grid of processors");
  if (domain->dimension == 2 && procgrid[2] != 1)
    error->all(FLERR,"Processor count in z must be 1 for 2d simulation");

-  // grid2proc[i][j][k] = proc that owns i,j,k location in grid
+  // grid2proc[i][j][k] = proc that owns i,j,k location in 3d grid

  if (grid2proc) memory->destroy(grid2proc);
  memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
 		 "comm:grid2proc");

-  // use MPI Cartesian routines to layout 3d grid of procs
-  // MPI may do layout in machine-optimized fashion
+  // map processor IDs to 3d processor grid
+  // produces myloc, procneigh, grid2proc

-  if (layoutflag == CART || layoutflag == CARTREORDER) {
-    int reorder = 0;
-    if (layoutflag == CARTREORDER) reorder = 1;
-    int periods[3];
-    periods[0] = periods[1] = periods[2] = 1;
-    MPI_Comm cartesian;
+  if (gridflag == ONELEVEL) {
+    if (mapflag == CART)
+      pmap->cart_map(0,procgrid,myloc,procneigh,grid2proc);
+    else if (mapflag == CARTREORDER)
+      pmap->cart_map(1,procgrid,myloc,procneigh,grid2proc);
+    else if (mapflag == XYZ)
+      pmap->xyz_map(xyz,procgrid,myloc,procneigh,grid2proc);

-    MPI_Cart_create(world,3,procgrid,periods,reorder,&cartesian);
-    MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
-    MPI_Cart_shift(cartesian,0,1,&procneigh[0][0],&procneigh[0][1]);
-    MPI_Cart_shift(cartesian,1,1,&procneigh[1][0],&procneigh[1][1]);
-    MPI_Cart_shift(cartesian,2,1,&procneigh[2][0],&procneigh[2][1]);
+  } else if (gridflag == NUMA) {
+    pmap->numa_map(coregrid,myloc,procneigh,grid2proc);

-    int coords[3];
-    int i,j,k;
-    for (i = 0; i < procgrid[0]; i++)
-      for (j = 0; j < procgrid[1]; j++)
-	for (k = 0; k < procgrid[2]; k++) {
-	  coords[0] = i; coords[1] = j; coords[2] = k;
-	  MPI_Cart_rank(cartesian,coords,&grid2proc[i][j][k]);
+  } else if (gridflag == CUSTOM) {
+    pmap->custom_map(myloc,procneigh,grid2proc);
  }

-    MPI_Comm_free(&cartesian);
+  // print 3d grid info to screen and logfile

-  // layout 3d grid of procs explicitly, via user-requested XYZ ordering
-
-  } else {
-    int i,j,k,ime,jme,kme;
-    for (i = 0; i < procgrid[0]; i++)
-      for (j = 0; j < procgrid[1]; j++)
-	for (k = 0; k < procgrid[2]; k++) {
-	  if (layoutflag == XYZ)
-	    grid2proc[i][j][k] = k*procgrid[1]*procgrid[0] + j*procgrid[0] + i;
-	  else if (layoutflag == XZY)
-	    grid2proc[i][j][k] = j*procgrid[2]*procgrid[0] + k*procgrid[0] + i;
-	  else if (layoutflag == YXZ)
-	    grid2proc[i][j][k] = k*procgrid[0]*procgrid[1] + i*procgrid[1] + j;
-	  else if (layoutflag == YZX)
-	    grid2proc[i][j][k] = i*procgrid[2]*procgrid[1] + k*procgrid[1] + j;
-	  else if (layoutflag == ZXY)
-	    grid2proc[i][j][k] = j*procgrid[0]*procgrid[2] + i*procgrid[2] + k;
-	  else if (layoutflag == ZYX)
-	    grid2proc[i][j][k] = i*procgrid[1]*procgrid[2] + j*procgrid[2] + k;
-
-	  if (grid2proc[i][j][k] == me) {
-	    ime = i; jme = j, kme = k;
+  if (me == 0) {
+    if (screen) {
+      fprintf(screen,"  %d by %d by %d MPI processor grid\n",
+	      procgrid[0],procgrid[1],procgrid[2]);
+      if (gridflag == NUMA) 
+	fprintf(screen,"  %d by %d by %d core grid within node\n",
+		coregrid[0],coregrid[1],coregrid[2]);
+    }
+    if (logfile) {
+      fprintf(logfile,"  %d by %d by %d MPI processor grid\n",
+	      procgrid[0],procgrid[1],procgrid[2]);
+      if (gridflag == NUMA) 
+	fprintf(logfile,"  %d by %d by %d core grid within node\n",
+		coregrid[0],coregrid[1],coregrid[2]);
    }
  }

-    myloc[0] = ime;
-    myloc[1] = jme;
-    myloc[2] = kme;
+  // print 3d grid details to outfile

-    i = ime-1;
-    if (i < 0) i = procgrid[0]-1;
-    procneigh[0][0] = grid2proc[i][jme][kme];
-    i = ime+1;
-    if (i == procgrid[0]) i = 0;
-    procneigh[0][1] = grid2proc[i][jme][kme];
-
-    j = jme-1;
-    if (j < 0) j = procgrid[1]-1;
-    procneigh[1][0] = grid2proc[ime][j][kme];
-    j = jme+1;
-    if (j == procgrid[1]) j = 0;
-    procneigh[1][1] = grid2proc[ime][j][kme];
-
-    k = kme-1;
-    if (k < 0) k = procgrid[2]-1;
-    procneigh[2][0] = grid2proc[ime][jme][k];
-    k = kme+1;
-    if (k == procgrid[2]) k = 0;
-    procneigh[2][1] = grid2proc[ime][jme][k];
-  }
+  if (outfile) pmap->output(grid2proc,outfile);

  // set lamda box params after procs are assigned

  if (domain->triclinic) domain->set_lamda_box();

-  if (me == 0) {
-    if (screen) fprintf(screen,"  %d by %d by %d MPI processor grid\n",
-			procgrid[0],procgrid[1],procgrid[2]);
-    if (logfile) fprintf(logfile,"  %d by %d by %d MPI processor grid\n",
-			 procgrid[0],procgrid[1],procgrid[2]);
-  }
+  // free ProcMap class

-  // send my proc layout to another partition if requested
+  delete pmap;
+
+  // send my 3d proc grid to another partition if requested

  if (send_to_partition >= 0) {
    if (me == 0) MPI_Send(procgrid,3,MPI_INT,
@ -1196,130 +1174,6 @@ void Comm::reverse_comm_dump(Dump *dump)
  }
 }

-/* ----------------------------------------------------------------------
-   assign numprocs to 3d box so as to minimize surface area
-   area = surface area of each of 3 faces of simulation box divided by sx,sy,sz
-   for triclinic, area = cross product of 2 edge vectors stored in h matrix
-   valid assignment will be factorization of numprocs = Px by Py by Pz
-   user_factors = if non-zero, factors are specified by user
-   sx,sy,sz = scale box xyz dimension vy dividing by sx,sy,sz
-   other = 1 to enforce compatability with other partition's layout
-   return factors = # of procs assigned to each dimension
-   return 1 if successully factor, 0 if not
------------------------------------------------------------------------- */
-
-int Comm::procs2box(int numprocs, int user_factors[3], int factors[3], 
-		    const int sx, const int sy, const int sz, int other)
-{
-  factors[0] = user_factors[0];
-  factors[1] = user_factors[1];
-  factors[2] = user_factors[2];
-
-  // all 3 proc counts are specified
-
-  if (factors[0] && factors[1] && factors[2]) return 1;
-
-  // 2 out of 3 proc counts are specified
-
-  if (factors[0] > 0 && factors[1] > 0) {
-    factors[2] = nprocs/(factors[0]*factors[1]);
-    return 1;
-  } else if (factors[0] > 0 && factors[2] > 0) {
-    factors[1] = nprocs/(factors[0]*factors[2]);
-    return 1;
-  } else if (factors[1] > 0 && factors[2] > 0) {
-    factors[0] = nprocs/(factors[1]*factors[2]);
-    return 1;
-  } 
-
-  // determine cross-sectional areas for orthogonal and triclinic boxes
-  // area[0] = xy, area[1] = xz, area[2] = yz
-
-  double area[3];
-  if (domain->triclinic == 0) {
-    area[0] = domain->xprd * domain->yprd / (sx * sy);
-    area[1] = domain->xprd * domain->zprd / (sx * sz);
-    area[2] = domain->yprd * domain->zprd / (sy * sz);
-  } else {
-    double *h = domain->h;
-    double a[3],b[3],c[3];
-    a[0] = h[0]; a[1] = 0.0; a[2] = 0.0;
-    b[0] = h[5]; b[1] = h[1]; b[2] = 0.0;
-    MathExtra::cross3(a,b,c);
-    area[0] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sx * sy);
-    a[0] = h[0]; a[1] = 0.0; a[2] = 0.0;
-    b[0] = h[4]; b[1] = h[3]; b[2] = h[2];
-    MathExtra::cross3(a,b,c);
-    area[1] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sx * sz);
-    a[0] = h[5]; a[1] = h[1]; a[2] = 0.0;
-    b[0] = h[4]; b[1] = h[3]; b[2] = h[2];
-    MathExtra::cross3(a,b,c);
-    area[2] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sy * sz);
-  }
-
-  double bestsurf = 2.0 * (area[0]+area[1]+area[2]);
-
-  // loop thru all possible factorizations of numprocs
-  // only consider valid cases that match procgrid settings
-  // surf = surface area of a proc sub-domain
-  // only consider cases that match user_factors & other_procgrid settings
-  // success = 1 if valid factoriztion is found
-  // may not be if other constraint is enforced
-
-  int ipx,ipy,ipz,valid;
-  double surf;
-  
-  int success = 0;
-  ipx = 1;
-  while (ipx <= numprocs) {
-    valid = 1;
-    if (user_factors[0] && ipx != user_factors[0]) valid = 0;
-    if (other == MULTIPLE && other_procgrid[0] % ipx) valid = 0;
-    if (numprocs % ipx) valid = 0;
-
-    if (!valid) {
-      ipx++;
-      continue;
-    }
-
-    ipy = 1;
-    while (ipy <= numprocs/ipx) {
-      valid = 1;
-      if (user_factors[1] && ipy != user_factors[1]) valid = 0;
-      if (other == MULTIPLE && other_procgrid[1] % ipy) valid = 0;
-      if ((numprocs/ipx) % ipy) valid = 0;
-      if (!valid) {
-	ipy++;
-	continue;
-      }
-      
-      ipz = numprocs/ipx/ipy;
-      valid = 1;
-      if (user_factors[2] && ipz != user_factors[2]) valid = 0;
-      if (other == MULTIPLE && other_procgrid[2] % ipz) valid = 0;
-      if (domain->dimension == 2 && ipz != 1) valid = 0;
-      if (!valid) {
-	ipy++;
-	continue;
-      }
-      
-      surf = area[0]/ipx/ipy + area[1]/ipx/ipz + area[2]/ipy/ipz;
-      if (surf < bestsurf) {
-	success = 1;
-	bestsurf = surf;
-	factors[0] = ipx;
-	factors[1] = ipy;
-	factors[2] = ipz;
-      }
-      ipy++;
-    }
-
-    ipx++;
-  }
-
-  return success;
-}
-
 /* ----------------------------------------------------------------------
   realloc the size of the send buffer as needed with BUFFACTOR & BUFEXTRA 
   if flag = 1, realloc
@ -1502,7 +1356,48 @@ void Comm::set_processors(int narg, char **arg)

  int iarg = 3;
  while (iarg < narg) {
-    if (strcmp(arg[iarg],"part") == 0) {
+    if (strcmp(arg[iarg],"grid") == 0) {
+      if (iarg+1 > narg) error->all(FLERR,"Illegal processors command");
+
+      if (strcmp(arg[iarg+1],"level1") == 0) {
+	gridflag = LEVEL1;
+	iarg += 2;
+	
+      if (strcmp(arg[iarg+1],"level2") == 0) {
+	gridflag = LEVEL2;
+	iarg += 2;
+	
+      } else if (strcmp(arg[iarg+1],"numa") == 0) {
+	gridflag = NUMA;
+	iarg += 2;
+	
+      } else if (strcmp(arg[iarg],"custom") == 0) {
+	if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
+	gridflag = CUSTOM;
+	delete [] customfile;
+	int n = strlen(arg[iarg+1]) + 1;
+	customfile = new char(n);
+	strcpy(customfile,arg[iarg+1]);
+	iarg += 2;
+
+      } else error->all(FLERR,"Illegal processors command");
+
+    } else if (strcmp(arg[iarg],"map") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
+      if (strcmp(arg[iarg+1],"cart") == 0) mapflag = CART;
+      else if (strcmp(arg[iarg+1],"cart/reorder") == 0) mapflag = CARTREORDER;
+      else if (strcmp(arg[iarg+1],"xyz") == 0 ||
+	       strcmp(arg[iarg+1],"xzy") == 0 ||
+	       strcmp(arg[iarg+1],"yxz") == 0 ||
+	       strcmp(arg[iarg+1],"yzx") == 0 ||
+	       strcmp(arg[iarg+1],"zxy") == 0 ||
+	       strcmp(arg[iarg+1],"zyx") == 0) {
+	mapflag = XYZ;
+	strcpy(xyz,arg[iarg+1]);
+      } else error->all(FLERR,"Illegal processors command");
+      iarg += 2;
+
+    } else if (strcmp(arg[iarg],"part") == 0) {
      if (iarg+4 > narg) error->all(FLERR,"Illegal processors command");
      if (universe->nworlds == 1)
 	error->all(FLERR,
@ -1527,250 +1422,32 @@ void Comm::set_processors(int narg, char **arg)
 		     "is already a receiver");
 	recv_from_partition = isend-1;
      }
+
+      // only receiver has otherflag dependency
+
      if (strcmp(arg[iarg+3],"multiple") == 0) {
-	if (universe->iworld == irecv-1) other_partition_style = MULTIPLE;
+	if (universe->iworld == irecv-1) {
+	  otherflag = 1;
+	  other_style = MULTIPLE;
+	}
      } else error->all(FLERR,"Illegal processors command");
      iarg += 4;

-    } else if (strcmp(arg[iarg],"grid") == 0) {
+    } else if (strcmp(arg[iarg],"out") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
-      if (strcmp(arg[iarg+1],"cart") == 0) layoutflag = CART;
-      else if (strcmp(arg[iarg+1],"cart/reorder") == 0)
-	layoutflag = CARTREORDER;
-      else if (strcmp(arg[iarg+1],"xyz") == 0) layoutflag = XYZ;
-      else if (strcmp(arg[iarg+1],"xzy") == 0) layoutflag = XZY;
-      else if (strcmp(arg[iarg+1],"yxz") == 0) layoutflag = YXZ;
-      else if (strcmp(arg[iarg+1],"yzx") == 0) layoutflag = YZX;
-      else if (strcmp(arg[iarg+1],"zxy") == 0) layoutflag = ZXY;
-      else if (strcmp(arg[iarg+1],"zyx") == 0) layoutflag = ZYX;
-      else error->all(FLERR,"Illegal processors command");
+      delete [] outfile;
+      int n = strlen(arg[iarg+1]) + 1;
+      outfile = new char(n);
+      strcpy(outfile,arg[iarg+1]);
      iarg += 2;

-    } else if (strcmp(arg[iarg],"numa") == 0) {
-      if (iarg+1 > narg) error->all(FLERR,"Illegal processors command");
-      numa_nodes = 1;
-      if (numa_nodes < 0) error->all(FLERR,"Illegal processors command");
-      iarg += 1;
-
    } else error->all(FLERR,"Illegal processors command");
  }

  // error check

-  if (numa_nodes) {
-    if (layoutflag != CART) 
-      error->all(FLERR,"Can only use processors numa "
-		 "with processors grid cart");
-    if (send_to_partition >= 0 || recv_from_partition >= 0) 
-      error->one(FLERR,"Cannot use processors numa with processors part");
-  }
-}
-
-/* ----------------------------------------------------------------------
-   setup 3d grid of procs based on box size, group neighbors by numa node
-   return 1 if successful, return 0 if not
------------------------------------------------------------------------- */
-
-int Comm::numa_set_proc_grid()
-{
-  // get names of all nodes
-
-  int name_length;
-  char node_name[MPI_MAX_PROCESSOR_NAME];
-  char node_names[MPI_MAX_PROCESSOR_NAME*nprocs];
-  MPI_Get_processor_name(node_name,&name_length);
-  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
-                MPI_MAX_PROCESSOR_NAME,MPI_CHAR,world);
-  std::string node_string = std::string(node_name);
-  
-  // get number of procs per node
-
-  std::map<std::string,int> name_map;
-  std::map<std::string,int>::iterator np;
-  for (int i = 0; i < nprocs; i++) {
-    std::string i_string = std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
-    np = name_map.find(i_string);
-    if (np == name_map.end())
-      name_map[i_string] = 1;
-    else
-      np->second++;
-  }
-  int procs_per_node = name_map.begin()->second;
-  int procs_per_numa = procs_per_node / numa_nodes;
-  
-  // use non-numa mapping if any condition met
-  
-  if (procs_per_numa < 4 ||               // less than 4 procs per numa node
-      procs_per_node % numa_nodes != 0 || // no-op since numa_nodes = 1 for now
-      nprocs % procs_per_numa != 0 ||     // total procs not a multiple of node
-      nprocs <= procs_per_numa ||         // only 1 node used
-      user_procgrid[0] > 1 ||             // user specified grid dimension < 1
-      user_procgrid[1] > 1 ||             //    in any dimension
-      user_procgrid[2] > 1) {
-    if (me == 0) {
-      if (screen) fprintf(screen,"  1 by 1 by 1 Node grid\n");
-      if (logfile) fprintf(logfile,"  1 by 1 by 1 Node grid\n");
-    }
-    return 0;
-  }
-  
-  // user settings for the factorization per numa node - currently always zero
-
-  int user_numagrid[3];
-  user_numagrid[0] = user_numagrid[1] = user_numagrid[2] = 0;
-  
-  // if user specifies 1 for a proc grid dimension,
-  // also use 1 for the node grid dimension
-
-  if (user_procgrid[0] == 1) user_numagrid[0] = 1;
-  if (user_procgrid[1] == 1) user_numagrid[1] = 1;
-  if (user_procgrid[2] == 1) user_numagrid[2] = 1;
-  
-  // get an initial factorization for each numa node,
-  // if the user has not set the number of processors
-
-  int numagrid[3];
-  procs2box(procs_per_numa,user_numagrid,numagrid,1,1,1,0);
-  if (numagrid[0] * numagrid[1] * numagrid[2] != procs_per_numa)
-    error->all(FLERR,"Bad node grid of processors");
-  
-  // get a factorization for the grid of numa nodes
-
-  int node_count = nprocs / procs_per_numa;
-  procs2box(node_count,user_procgrid,procgrid,
-	    numagrid[0],numagrid[1],numagrid[2],0);
-  if (procgrid[0] * procgrid[1] * procgrid[2] != node_count)
-    error->all(FLERR,"Bad grid of processors");
-  
-  // repeat the numa node factorization using the subdomain sizes
-  // this will refine the factorization if the user specified the node layout
-
-  procs2box(procs_per_numa,user_numagrid,numagrid,
-	    procgrid[0],procgrid[1],procgrid[2],0);
-  if (numagrid[0]*numagrid[1]*numagrid[2] != procs_per_numa)
-    error->all(FLERR,"Bad Node grid of processors");
-  if (domain->dimension == 2 && (procgrid[2] != 1 || numagrid[2] != 1))
-    error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
-  
-  // assign a unique id to each node
-
-  int node_num = 0, node_id = 0;
-  for (np = name_map.begin(); np != name_map.end(); ++np) {
-    if (np->first == node_string) node_id = node_num;
-    node_num++;
-  }
-  
-  // setup a per node communicator and find rank within
-
-  MPI_Comm node_comm;
-  MPI_Comm_split(world,node_id,0,&node_comm);  
-  int node_rank;
-  MPI_Comm_rank(node_comm,&node_rank);
-  
-  // setup a per numa communicator and find rank within
-
-  MPI_Comm numa_comm;
-  int local_numa = node_rank / procs_per_numa;
-  MPI_Comm_split(node_comm,local_numa,0,&numa_comm);     
-  int numa_rank;
-  MPI_Comm_rank(numa_comm,&numa_rank);
-  
-  // setup a communicator with the rank 0 procs from each numa node
-
-  MPI_Comm numa_leaders;
-  MPI_Comm_split(world,numa_rank,0,&numa_leaders);
-  
-  // use the MPI Cartesian routines to map the nodes to the grid
-  // could implement layoutflag as in non-NUMA case?
-
-  int reorder = 0;
-  int periods[3];
-  periods[0] = periods[1] = periods[2] = 1;
-  MPI_Comm cartesian;
-  if (numa_rank == 0) {
-    MPI_Cart_create(numa_leaders,3,procgrid,periods,reorder,&cartesian);
-    MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
-  }
-  
-  // broadcast numa node location in grid to other procs in numa node
-
-  MPI_Bcast(myloc,3,MPI_INT,0,numa_comm);
-  
-  // get storage for the process mapping
-
-  if (grid2proc) memory->destroy(grid2proc);
-  memory->create(grid2proc,procgrid[0]*numagrid[0],procgrid[1]*numagrid[1],
-                 procgrid[2]*numagrid[2],"comm:grid2proc");
-  
-  // compute my location within the grid
-
-  int z_offset = numa_rank / (numagrid[0] * numagrid[1]);
-  int y_offset = (numa_rank % (numagrid[0] * numagrid[1]))/numagrid[0];
-  int x_offset = numa_rank % numagrid[0];
-  myloc[0] = myloc[0] * numagrid[0] + x_offset;
-  myloc[1] = myloc[1] * numagrid[1] + y_offset;
-  myloc[2] = myloc[2] * numagrid[2] + z_offset;
-  procgrid[0] *= numagrid[0];
-  procgrid[1] *= numagrid[1];
-  procgrid[2] *= numagrid[2];
-  
-  // allgather of locations to fill grid2proc
-
-  int **gridi;
-  memory->create(gridi,nprocs,3,"comm:gridi");
-  MPI_Allgather(&myloc,3,MPI_INT,gridi[0],3,MPI_INT,world);
-  for (int i = 0; i < nprocs; i++)
-    grid2proc[gridi[i][0]][gridi[i][1]][gridi[i][2]] = i;
-  memory->destroy(gridi);
-  
-  // get my neighbors
-
-  int minus, plus;
-  for (int i = 0; i < 3; i++) {
-    numa_shift(myloc[i],procgrid[i],minus,plus);
-    procneigh[i][0] = minus;
-    procneigh[i][1] = plus;
-  }
-  procneigh[0][0] = grid2proc[procneigh[0][0]][myloc[1]][myloc[2]];
-  procneigh[0][1] = grid2proc[procneigh[0][1]][myloc[1]][myloc[2]];
-  procneigh[1][0] = grid2proc[myloc[0]][procneigh[1][0]][myloc[2]];
-  procneigh[1][1] = grid2proc[myloc[0]][procneigh[1][1]][myloc[2]];
-  procneigh[2][0] = grid2proc[myloc[0]][myloc[1]][procneigh[2][0]];
-  procneigh[2][1] = grid2proc[myloc[0]][myloc[1]][procneigh[2][1]];
-  
-  if (numa_rank == 0) MPI_Comm_free(&cartesian);
-  MPI_Comm_free(&numa_leaders);
-  MPI_Comm_free(&numa_comm);
-  MPI_Comm_free(&node_comm);
-  
-  // set lamda box params after procs are assigned
-
-  if (domain->triclinic) domain->set_lamda_box();
-  
-  if (me == 0) {
-    if (screen) fprintf(screen,"  %d by %d by %d Node grid\n",
-			numagrid[0],numagrid[1],numagrid[2]);
-    if (logfile) fprintf(logfile,"  %d by %d by %d Node grid\n",
-			 numagrid[0],numagrid[1],numagrid[2]);
-    if (screen) fprintf(screen,"  %d by %d by %d processor grid\n",
-			procgrid[0],procgrid[1],procgrid[2]);
-    if (logfile) fprintf(logfile,"  %d by %d by %d processor grid\n",
-			 procgrid[0],procgrid[1],procgrid[2]);
-  }
-
-  return 1;
-}
-
-/* ----------------------------------------------------------------------
-   get the index to the neighboring processors in a dimension
------------------------------------------------------------------------- */
-
-void Comm::numa_shift(int myloc, int num_procs, int &minus, int &plus) 
-{
-  minus = myloc - 1;
-  if (minus < 0) minus = num_procs - 1;
-  plus = myloc + 1;
-  if (plus == num_procs) plus = 0;
+  if (gridflag == NUMA && mapflag != CART)
+    error->all(FLERR,"Processors grid numa and map choice are incompatible");
 }

 /* ----------------------------------------------------------------------
--- a/src/comm.h
+++ b/src/comm.h
@ -83,10 +83,16 @@ class Comm : protected Pointers {
  int comm_x_only,comm_f_only;      // 1 if only exchange x,f in for/rev comm
  int map_style;                    // non-0 if global->local mapping is done
  int bordergroup;                  // only communicate this group in borders
-  int layoutflag;                   // user-selected layout for 3d proc grid
-  int numa_nodes;                   // layout NUMA-aware 3d proc grid
+  int gridflag;                     // option for creating 3d grid
+  int mapflag;                      // option for mapping procs to 3d grid
+  char xyz[4];                      // xyz mapping of procs to 3d grid
+  char *customfile;                 // file with custom proc map
+  char *outfile;                    // proc grid/map output file

+  int otherflag;                    // 1 if this partition dependent on another
+  int other_style;                  // style of dependency
  int other_procgrid[3];            // proc layout of another partition
+  int coregrid[3];                  // procs layout of cores within a node

  int *firstrecv;                   // where to put 1st recv atom in each swap
  int **sendlist;                   // list of atoms to send in each swap
@ -97,8 +103,6 @@ class Comm : protected Pointers {
  int maxsend,maxrecv;              // current size of send/recv buffer
  int maxforward,maxreverse;        // max # of datums in forward/reverse comm
 
-  int procs2box(int, int[3], int[3],        // map procs to 3d box
-		const int, const int, const int, int);
  virtual void grow_send(int,int);          // reallocate send buffer
  virtual void grow_recv(int);              // free/allocate recv buffer
  virtual void grow_list(int, int);         // reallocate one sendlist
@ -107,9 +111,6 @@ class Comm : protected Pointers {
  virtual void allocate_multi(int);         // allocate multi arrays
  virtual void free_swap();                 // free swap arrays
  virtual void free_multi();                // free multi arrays
-
-  int numa_set_proc_grid();
-  void numa_shift(int, int, int &, int &);
 };

 }
--- a/src/error.cpp
+++ b/src/error.cpp
@ -69,12 +69,10 @@ void Error::universe_one(const char *file, int line, const char *str)
   called by all procs in one world
   close all output, screen, and log files in world
   insure all procs in world call, else will hang
-   if abort = 0 (default):
-     if only one world in universe calls, universe will hang
-   if abort = 1: force abort of entire universe if any world in universe calls
+   force MPI_Abort if running in multi-partition mode
 ------------------------------------------------------------------------- */

-void Error::all(const char *file, int line, const char *str, int abort)
+void Error::all(const char *file, int line, const char *str)
 {
  MPI_Barrier(world);

@ -90,7 +88,7 @@ void Error::all(const char *file, int line, const char *str, int abort)
  if (screen && screen != stdout) fclose(screen);
  if (logfile) fclose(logfile);

-  if (abort) MPI_Abort(world,1);
+  if (universe->nworlds > 1) MPI_Abort(universe->uworld,1);
  MPI_Finalize();
  exit(1);
 }
--- a/src/error.h
+++ b/src/error.h
@ -25,7 +25,7 @@ class Error : protected Pointers {
  void universe_all(const char *, int, const char *);
  void universe_one(const char *, int, const char *);

-  void all(const char *, int, const char *, int = 0);
+  void all(const char *, int, const char *);
  void one(const char *, int, const char *);
  void warning(const char *, int, const char *, int = 1);
  void message(const char *, int, char *, int = 1);
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@ -149,12 +149,12 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
      iarg += 2;
    } else if (strcmp(arg[iarg],"-reorder") == 0 || 
 	       strcmp(arg[iarg],"-r") == 0) {
-      if (iarg+2 > narg) 
+      if (iarg+3 > narg) 
 	error->universe_all(FLERR,"Invalid command-line argument");
      if (universe->existflag)
 	error->universe_all(FLERR,"Cannot use -reorder after -partition");
-      universe->reorder(arg[iarg+1]);
-      iarg += 2;
+      universe->reorder(arg[iarg+1],arg[iarg+2]);
+      iarg += 3;
    } else if (strcmp(arg[iarg],"-help") == 0 || 
 	       strcmp(arg[iarg],"-h") == 0) {
      if (iarg+1 > narg) 
--- a/src/pair_born_coul_wolf.cpp
+++ b/src/pair_born_coul_wolf.cpp
@ -0,0 +1,470 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Yongfeng Zhang (INL), yongfeng.zhang@inl.gov
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "pair_born_coul_wolf.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "math_const.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+PairBornCoulWolf::PairBornCoulWolf(LAMMPS *lmp) : Pair(lmp) {}
+
+/* ---------------------------------------------------------------------- */
+
+PairBornCoulWolf::~PairBornCoulWolf()
+{
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cutsq);
+
+    memory->destroy(cut_lj);
+    memory->destroy(cut_ljsq);
+    memory->destroy(a);
+    memory->destroy(rho);
+    memory->destroy(sigma);
+    memory->destroy(c);
+    memory->destroy(d);
+    memory->destroy(rhoinv);
+    memory->destroy(born1);
+    memory->destroy(born2);
+    memory->destroy(born3);
+    memory->destroy(offset);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulWolf::compute(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,forcecoul,forceborn,factor_coul,factor_lj;
+  double prefactor;
+  double r,rexp;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double erfcc,erfcd,v_sh,dvdrr,e_self,qisq;
+
+  evdwl = ecoul = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  int newton_pair = force->newton_pair;
+  double qqrd2e = force->qqrd2e;
+
+  // self and shifted coulombic energy
+
+  e_self = v_sh = 0.0; 
+  e_shift = erfc(alf*cut_coul)/cut_coul;
+  f_shift = -(e_shift+ 2.0*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) / 
+    cut_coul; 
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+ 
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    qisq = qtmp*qtmp;
+    e_self = -(e_shift/2.0 + alf/MY_PIS) * qisq*qqrd2e;
+    if (evflag) ev_tally(i,i,nlocal,0,0.0,e_self,0.0,0.0,0.0,0.0);
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      if (j < nall) factor_coul = factor_lj = 1.0;
+      else {
+	factor_coul = special_coul[j/nall];
+	factor_lj = special_lj[j/nall];
+	j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+	r2inv = 1.0/rsq;
+
+	if (rsq < cut_coulsq) {
+	  r = sqrt(rsq);
+          prefactor = qqrd2e*qtmp*q[j]/r;
+          erfcc = erfc(alf*r); 
+          erfcd = exp(-alf*alf*r*r);
+          v_sh = (erfcc - e_shift*r) * prefactor; 
+          dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
+          forcecoul = dvdrr*rsq*prefactor;
+	  if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+	} else forcecoul = 0.0;
+
+	if (rsq < cut_ljsq[itype][jtype]) {
+	  r6inv = r2inv*r2inv*r2inv;
+          r = sqrt(rsq);
+	  rexp = exp((sigma[itype][jtype]-r)*rhoinv[itype][jtype]);
+	  forceborn = born1[itype][jtype]*r*rexp - born2[itype][jtype]*r6inv
+	    + born3[itype][jtype]*r2inv*r6inv;
+	} else forceborn = 0.0;
+
+	fpair = (forcecoul + factor_lj*forceborn) * r2inv;
+
+	f[i][0] += delx*fpair;
+	f[i][1] += dely*fpair;
+	f[i][2] += delz*fpair;
+	if (newton_pair || j < nlocal) {
+	  f[j][0] -= delx*fpair;
+	  f[j][1] -= dely*fpair;
+	  f[j][2] -= delz*fpair;
+	}
+
+	if (eflag) {
+	  if (rsq < cut_coulsq) {
+	    ecoul = v_sh;
+	    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+	  } else ecoul = 0.0;
+	  if (rsq < cut_ljsq[itype][jtype]) {
+	    evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv +
+	      d[itype][jtype]*r6inv*r2inv - offset[itype][jtype];
+	    evdwl *= factor_lj;
+	  } else evdwl = 0.0;
+	}
+
+	if (evflag) ev_tally(i,j,nlocal,newton_pair,
+			     evdwl,ecoul,fpair,delx,dely,delz);
+      }
+    }
+  }
+
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairBornCoulWolf::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cutsq,n+1,n+1,"pair:cutsq");
+
+  memory->create(cut_lj,n+1,n+1,"pair:cut_lj");
+  memory->create(cut_ljsq,n+1,n+1,"pair:cut_ljsq");
+  memory->create(a,n+1,n+1,"pair:a");
+  memory->create(rho,n+1,n+1,"pair:rho");
+  memory->create(sigma,n+1,n+1,"pair:sigma");
+  memory->create(c,n+1,n+1,"pair:c");
+  memory->create(d,n+1,n+1,"pair:d");
+  memory->create(rhoinv,n+1,n+1,"pair:rhoinv");
+  memory->create(born1,n+1,n+1,"pair:born1");
+  memory->create(born2,n+1,n+1,"pair:born2");
+  memory->create(born3,n+1,n+1,"pair:born3");
+  memory->create(offset,n+1,n+1,"pair:offset");
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairBornCoulWolf::settings(int narg, char **arg)
+{
+  if (narg < 2 || narg > 3) error->all(FLERR,"Illegal pair_style command");
+
+  alf = force->numeric(arg[0]);
+  cut_lj_global = force->numeric(arg[1]);
+  if (narg == 2) cut_coul = cut_lj_global;
+  else cut_coul = force->numeric(arg[2]);
+
+  if (allocated) {
+    int i,j;
+    for (i = 1; i <= atom->ntypes; i++)
+      for (j = i+1; j <= atom->ntypes; j++)
+	if (setflag[i][j]) cut_lj[i][j] = cut_lj_global;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairBornCoulWolf::coeff(int narg, char **arg)
+{
+  if (narg < 7 || narg > 8) 
+    error->all(FLERR,"Incorrect args for pair coefficients");
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(arg[1],atom->ntypes,jlo,jhi);
+
+  double a_one = force->numeric(arg[2]);
+  double rho_one = force->numeric(arg[3]);
+  double sigma_one = force->numeric(arg[4]);
+  if (rho_one <= 0) error->all(FLERR,"Incorrect args for pair coefficients");
+  double c_one = force->numeric(arg[5]);
+  double d_one = force->numeric(arg[6]);
+
+  double cut_lj_one = cut_lj_global;
+  if (narg == 8) cut_lj_one = force->numeric(arg[7]);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      a[i][j] = a_one;
+      rho[i][j] = rho_one;
+      sigma[i][j] = sigma_one;
+      c[i][j] = c_one;
+      d[i][j] = d_one;
+      cut_lj[i][j] = cut_lj_one;
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairBornCoulWolf::init_style()
+{
+  if (!atom->q_flag)
+    error->all(FLERR,"Pair style born/coul/Wolf requires atom attribute q");
+
+  int irequest = neighbor->request(this);
+
+  cut_coulsq = cut_coul * cut_coul;
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairBornCoulWolf::init_one(int i, int j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+
+  double cut = MAX(cut_lj[i][j],cut_coul);
+  cut_ljsq[i][j] = cut_lj[i][j] * cut_lj[i][j];
+
+  rhoinv[i][j] = 1.0/rho[i][j];
+  born1[i][j] = a[i][j]/rho[i][j];
+  born2[i][j] = 6.0*c[i][j];
+  born3[i][j] = 8.0*d[i][j];
+     
+  if (offset_flag) {
+    double rexp = exp(-cut_lj[i][j]/rho[i][j]);
+    offset[i][j] = a[i][j]*rexp - c[i][j]/pow(cut_lj[i][j],6.0)
+      + d[i][j]/pow(cut_lj[i][j],8.0);
+  } else offset[i][j] = 0.0;
+
+  cut_ljsq[j][i] = cut_ljsq[i][j];
+  a[j][i] = a[i][j];
+  c[j][i] = c[i][j];
+  d[j][i] = d[i][j];
+  rhoinv[j][i] = rhoinv[i][j];
+  sigma[j][i] = sigma[i][j];
+  born1[j][i] = born1[i][j];
+  born2[j][i] = born2[i][j];
+  born3[j][i] = born3[i][j];
+  offset[j][i] = offset[i][j];
+
+  return cut;
+}
+
+/* ----------------------------------------------------------------------
+  proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairBornCoulWolf::write_restart(FILE *fp)
+{
+  write_restart_settings(fp);
+
+  int i,j;
+  for (i = 1; i <= atom->ntypes; i++)
+    for (j = i; j <= atom->ntypes; j++) {
+      fwrite(&setflag[i][j],sizeof(int),1,fp);
+      if (setflag[i][j]) {
+	fwrite(&a[i][j],sizeof(double),1,fp);
+	fwrite(&rho[i][j],sizeof(double),1,fp);
+	fwrite(&sigma[i][j],sizeof(double),1,fp);
+	fwrite(&c[i][j],sizeof(double),1,fp);
+	fwrite(&d[i][j],sizeof(double),1,fp);
+	fwrite(&cut_lj[i][j],sizeof(double),1,fp);
+      }
+    }
+}
+
+/* ----------------------------------------------------------------------
+  proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairBornCoulWolf::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);
+
+  allocate();
+
+  int i,j;
+  int me = comm->me;
+  for (i = 1; i <= atom->ntypes; i++)
+    for (j = i; j <= atom->ntypes; j++) {
+      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
+      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
+      if (setflag[i][j]) {
+	if (me == 0) {
+	  fread(&a[i][j],sizeof(double),1,fp);
+	  fread(&rho[i][j],sizeof(double),1,fp);
+	  fread(&sigma[i][j],sizeof(double),1,fp);
+	  fread(&c[i][j],sizeof(double),1,fp);
+	  fread(&d[i][j],sizeof(double),1,fp);
+	  fread(&cut_lj[i][j],sizeof(double),1,fp);
+	}
+	MPI_Bcast(&a[i][j],1,MPI_DOUBLE,0,world);
+	MPI_Bcast(&rho[i][j],1,MPI_DOUBLE,0,world);
+	MPI_Bcast(&sigma[i][j],1,MPI_DOUBLE,0,world);
+	MPI_Bcast(&c[i][j],1,MPI_DOUBLE,0,world);
+	MPI_Bcast(&d[i][j],1,MPI_DOUBLE,0,world);
+	MPI_Bcast(&cut_lj[i][j],1,MPI_DOUBLE,0,world);
+      }
+    }
+}
+
+/* ----------------------------------------------------------------------
+  proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairBornCoulWolf::write_restart_settings(FILE *fp)
+{
+  fwrite(&alf,sizeof(double),1,fp);
+  fwrite(&cut_lj_global,sizeof(double),1,fp);
+  fwrite(&cut_coul,sizeof(double),1,fp);
+  fwrite(&offset_flag,sizeof(int),1,fp);
+  fwrite(&mix_flag,sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+  proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairBornCoulWolf::read_restart_settings(FILE *fp)
+{
+  if (comm->me == 0) {
+    fread(&alf,sizeof(double),1,fp);
+    fread(&cut_lj_global,sizeof(double),1,fp);
+    fread(&cut_coul,sizeof(double),1,fp);
+    fread(&offset_flag,sizeof(int),1,fp);
+    fread(&mix_flag,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&alf,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&cut_lj_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&cut_coul,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
+  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
+}
+
+/* ----------------------------------------------------------------------
+   only the pair part is calculated here
+------------------------------------------------------------------------- */
+
+double PairBornCoulWolf::single(int i, int j, int itype, int jtype,
+				double rsq, 
+				double factor_coul, double factor_lj,
+				double &fforce)
+{
+  double r2inv,r6inv,r,prefactor,rexp;
+  double forcecoul,forceborn,phicoul,phiborn;
+  double e_shift,f_shift,dvdrr,erfcc,erfcd; 
+
+  r2inv = 1.0/rsq;
+  e_shift = erfc(alf*cut_coul) / cut_coul;
+  f_shift = -(e_shift+2*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) / 
+    cut_coul; 
+ 
+  if (rsq < cut_coulsq) {
+    r = sqrt(rsq);
+    prefactor = force->qqrd2e * atom->q[i]*atom->q[j]/r;
+    erfcc = erfc(alf*r); 
+    erfcd = exp(-alf*alf*r*r);
+    dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
+    forcecoul = dvdrr*rsq*prefactor;
+    if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+  } else forcecoul = 0.0;
+
+  if (rsq < cut_ljsq[itype][jtype]) {
+    r6inv = r2inv*r2inv*r2inv;
+    r = sqrt(rsq);
+    rexp = exp(-r*rhoinv[itype][jtype]);
+    forceborn = born1[itype][jtype]*r*rexp - born2[itype][jtype]*r6inv
+      + born3[itype][jtype]*r2inv*r6inv;
+  } else forceborn = 0.0;
+
+  fforce = (forcecoul + factor_lj*forceborn) * r2inv;
+  
+  double eng = 0.0;
+  if (rsq < cut_coulsq) {
+    phicoul = prefactor * (erfcc-e_shift*r);
+    if (factor_coul < 1.0) phicoul -= (1.0-factor_coul)*prefactor;
+    eng += phicoul;
+  }
+  if (rsq < cut_ljsq[itype][jtype]) {
+    phiborn = a[itype][jtype]*rexp - c[itype][jtype]*r6inv 
+      + d[itype][jtype]*r2inv*r6inv - offset[itype][jtype];
+    eng += factor_lj*phiborn;
+  }
+  return eng;
+}
--- a/src/pair_born_coul_wolf.h
+++ b/src/pair_born_coul_wolf.h
@ -0,0 +1,56 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(born/coul/wolf,PairBornCoulWolf)
+
+#else
+
+#ifndef LMP_PAIR_BORN_COUL_WOLF_H
+#define LMP_PAIR_BORN_COUL_WOLF_H
+
+#include "pair.h"
+
+namespace LAMMPS_NS {
+
+class PairBornCoulWolf : public Pair {
+ public:
+  PairBornCoulWolf(class LAMMPS *);
+  ~PairBornCoulWolf();
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_style();
+  double init_one(int, int);
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  double single(int, int, int, int, double, double, double, double &);
+  
+ private:
+  double cut_lj_global;
+  double **cut_lj,**cut_ljsq;
+  double cut_coul,cut_coulsq;
+  double **a,**rho,**sigma,**c,**d;
+  double **rhoinv,**born1,**born2,**born3,**offset;
+  double alf,e_shift,f_shift;
+
+  void allocate();
+};
+
+}
+
+#endif
+#endif
--- a/src/pair_coul_wolf.cpp
+++ b/src/pair_coul_wolf.cpp
@ -0,0 +1,328 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Yongfeng Zhang (INL), yongfeng.zhang@inl.gov
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "pair_coul_wolf.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "math_const.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+PairCoulWolf::PairCoulWolf(LAMMPS *lmp) : Pair(lmp) {}
+
+/* ---------------------------------------------------------------------- */
+
+PairCoulWolf::~PairCoulWolf()
+{
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cutsq);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCoulWolf::compute(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,ecoul,fpair;
+  double rsq,forcecoul,factor_coul;
+  double prefactor;
+  double r,rexp;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double erfcc,erfcd,v_sh,dvdrr,e_self,qisq;
+
+  ecoul = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  double *special_coul = force->special_coul;
+  int newton_pair = force->newton_pair;
+  double qqrd2e = force->qqrd2e;
+
+  // self and shifted coulombic energy
+
+  e_self = v_sh = 0.0; 
+  e_shift = erfc(alf*cut_coul)/cut_coul;
+  f_shift = -(e_shift+ 2.0*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) / 
+    cut_coul; 
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+ 
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    qisq = qtmp*qtmp;
+    e_self = -(e_shift/2.0 + alf/MY_PIS) * qisq*qqrd2e;
+    if (evflag) ev_tally(i,i,nlocal,0,0.0,e_self,0.0,0.0,0.0,0.0);
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      if (j < nall) factor_coul  = 1.0;
+      else {
+	factor_coul = special_coul[j/nall];
+	j %= nall;
+      }
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cut_coulsq) {
+	r = sqrt(rsq);
+	prefactor = qqrd2e*qtmp*q[j]/r;
+	erfcc = erfc(alf*r); 
+	erfcd = exp(-alf*alf*r*r);
+	v_sh = (erfcc - e_shift*r) * prefactor; 
+	dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
+	forcecoul = dvdrr*rsq*prefactor;
+	if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+	fpair = forcecoul / rsq;
+
+	f[i][0] += delx*fpair;
+	f[i][1] += dely*fpair;
+	f[i][2] += delz*fpair;
+	if (newton_pair || j < nlocal) {
+	  f[j][0] -= delx*fpair;
+	  f[j][1] -= dely*fpair;
+	  f[j][2] -= delz*fpair;
+	}
+
+	if (eflag) {
+	  if (rsq < cut_coulsq) {
+	    ecoul = v_sh;
+	    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+	  } else ecoul = 0.0;
+	}
+
+	if (evflag) ev_tally(i,j,nlocal,newton_pair,
+			     0.0,ecoul,fpair,delx,dely,delz);
+      }
+    }
+  }
+  
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairCoulWolf::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cutsq,n+1,n+1,"pair:cutsq");
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+   unlike other pair styles,
+     there are no individual pair settings that these override
+------------------------------------------------------------------------- */
+
+void PairCoulWolf::settings(int narg, char **arg)
+{
+  if (narg != 2) error->all(FLERR,"Illegal pair_style command");
+
+  alf = force->numeric(arg[0]);
+  cut_coul = force->numeric(arg[1]);
+}
+
+/* ----------------------------------------------------------------------
+   set cutoffs for one or more type pairs, optional 
+------------------------------------------------------------------------- */
+
+void PairCoulWolf::coeff(int narg, char **arg)
+{
+  if (narg != 2) error->all(FLERR,"Incorrect args for pair coefficients");
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(arg[1],atom->ntypes,jlo,jhi);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style 
+------------------------------------------------------------------------- */
+
+void PairCoulWolf::init_style()
+{
+  if (!atom->q_flag)
+    error->all(FLERR,"Pair coul/wolf requires atom attribute q");
+
+  int irequest = neighbor->request(this);
+
+  cut_coulsq = cut_coul*cut_coul;
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairCoulWolf::init_one(int i, int j)
+{
+  return cut_coul;
+}
+
+/* ----------------------------------------------------------------------
+  proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairCoulWolf::write_restart(FILE *fp)
+{
+  write_restart_settings(fp);
+
+  int i,j;
+  for (i = 1; i <= atom->ntypes; i++)
+    for (j = i; j <= atom->ntypes; j++)
+      fwrite(&setflag[i][j],sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+  proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairCoulWolf::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);
+  allocate();
+
+  int i,j;
+  int me = comm->me;
+  for (i = 1; i <= atom->ntypes; i++)
+    for (j = i; j <= atom->ntypes; j++) {
+      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
+      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
+    }
+}
+
+/* ----------------------------------------------------------------------
+  proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairCoulWolf::write_restart_settings(FILE *fp)
+{
+  fwrite(&alf,sizeof(double),1,fp);
+  fwrite(&cut_coul,sizeof(double),1,fp);
+  fwrite(&offset_flag,sizeof(int),1,fp);
+  fwrite(&mix_flag,sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairCoulWolf::read_restart_settings(FILE *fp)
+{
+  if (comm->me == 0) {
+    fread(&alf,sizeof(double),1,fp);
+    fread(&cut_coul,sizeof(double),1,fp);
+    fread(&offset_flag,sizeof(int),1,fp);
+    fread(&mix_flag,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&alf,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&cut_coul,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
+  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
+}
+
+/* ----------------------------------------------------------------------
+   only the pair part is calculated here
+------------------------------------------------------------------------- */
+
+double PairCoulWolf::single(int i, int j, int itype, int jtype, double rsq, 
+			    double factor_coul, double factor_lj,
+			    double &fforce)
+{
+  double r6inv,r,prefactor,rexp;
+  double forcecoul,forceborn,phicoul;
+  double e_shift,f_shift,dvdrr,erfcc,erfcd; 
+
+  e_shift = erfc(alf*cut_coul) / cut_coul;
+  f_shift = -(e_shift+ 2.0*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) / 
+    cut_coul; 
+ 
+  if (rsq < cut_coulsq) {
+    r = sqrt(rsq);
+    prefactor = force->qqrd2e * atom->q[i]*atom->q[j]/r;
+    erfcc = erfc(alf*r); 
+    erfcd = exp(-alf*alf*r*r);
+    dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
+    forcecoul = dvdrr*rsq*prefactor;
+    if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+  } else forcecoul = 0.0;
+  fforce = forcecoul / rsq;
+  
+  double eng = 0.0;
+  if (rsq < cut_coulsq) {
+    phicoul = prefactor * (erfcc-e_shift*r);
+    if (factor_coul < 1.0) phicoul -= (1.0-factor_coul)*prefactor;
+    eng += phicoul;
+  }
+  return eng;
+}
--- a/src/pair_coul_wolf.h
+++ b/src/pair_coul_wolf.h
@ -0,0 +1,52 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(coul/wolf,PairCoulWolf)
+
+#else
+
+#ifndef LMP_PAIR_COUL_WOLF_H
+#define LMP_PAIR_COUL_WOLF_H
+
+#include "pair.h"
+
+namespace LAMMPS_NS {
+
+class PairCoulWolf : public Pair {
+ public:
+  PairCoulWolf(class LAMMPS *);
+  ~PairCoulWolf();
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_style();
+  double init_one(int, int);
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  double single(int, int, int, int, double, double, double, double &);
+
+ private:
+  double cut_coul,cut_coulsq;
+  double alf,e_shift,f_shift; 
+
+  void allocate();
+};
+
+}
+
+#endif
+#endif
--- a/src/pair_lj_cut.cpp
+++ b/src/pair_lj_cut.cpp
@ -446,7 +446,8 @@ void PairLJCut::settings(int narg, char **arg)

 void PairLJCut::coeff(int narg, char **arg)
 {
-  if (narg < 4 || narg > 5) error->all(FLERR,"Incorrect args for pair coefficients");
+  if (narg < 4 || narg > 5) 
+    error->all(FLERR,"Incorrect args for pair coefficients");
  if (!allocated) allocate();

  int ilo,ihi,jlo,jhi;
--- a/src/universe.cpp
+++ b/src/universe.cpp
@ -59,22 +59,38 @@ Universe::~Universe()
 }

 /* ----------------------------------------------------------------------
-   reorder universe processors based on custom file
+   reorder universe processors
+   create uni2orig as inverse mapping
+   re-create uworld communicator with new ordering via Comm_split()
+   style = "nth", arg = N
+   move every Nth proc to end of rankings
+   style = "custom", arg = filename
   file has nprocs lines with I J
   I = universe proc ID in original communicator uorig
   J = universe proc ID in reordered communicator uworld
-   create uni2orig as inverse mapping
-   re-create uworld communicator with new ordering via Comm_split()
 ------------------------------------------------------------------------- */

-void Universe::reorder(char *file)
+void Universe::reorder(char *style, char *arg)
 {
  char line[MAXLINE];

  if (uworld != uorig) MPI_Comm_free(&uworld);

+  if (strcmp(style,"nth") == 0) {
+    int n = atoi(arg);
+    if (n <= 0) 
+      error->universe_all(FLERR,"Invalid -reorder N value");
+    if (nprocs % n) 
+      error->universe_all(FLERR,"Nprocs not a multiple of N for -reorder");
+    for (int i = 0; i < nprocs; i++) {
+      if (i < (n-1)*nprocs/n) uni2orig[i] = i/(n-1) * n + (i % (n-1));
+      else uni2orig[i] = (i - (n-1)*nprocs/n) * n + n-1;
+    }
+
+  } else if (strcmp(style,"custom") == 0) {
+
    if (me == 0) {
-    FILE *fp = fopen(file,"r");
+      FILE *fp = fopen(arg,"r");
      if (fp == NULL) error->universe_one(FLERR,"Cannot open -reorder file");

      // skip header = blank and comment lines
@ -102,7 +118,7 @@ void Universe::reorder(char *file)
      for (int i = 1; i < nprocs; i++) {
 	if (!fgets(line,MAXLINE,fp))
 	  error->one(FLERR,"Unexpected end of reorder file");
-      sscanf(line,"%ld %ld",&me_orig,&me_new);
+	sscanf(line,"%d %d",&me_orig,&me_new);
 	if (me_orig < 0 || me_orig >= nprocs ||
 	    me_new < 0 || me_new >= nprocs)
 	  error->one(FLERR,"Invalid entry in reorder file");
@ -111,8 +127,14 @@ void Universe::reorder(char *file)
      fclose(fp);
    }

+    // bcast uni2org from proc 0 to all other universe procs
+    
    MPI_Bcast(uni2orig,nprocs,MPI_INT,0,uorig);

+  } else error->universe_all(FLERR,"Invalid command-line argument");
+
+  // create new uworld communicator
+
  int ome,key;
  MPI_Comm_rank(uorig,&ome);
  for (int i = 0; i < nprocs; i++)
--- a/src/universe.h
+++ b/src/universe.h
@ -41,7 +41,7 @@ class Universe : protected Pointers {

  Universe(class LAMMPS *, MPI_Comm);
  ~Universe();
-  void reorder(char *);
+  void reorder(char *, char *);
  void add_world(char *);
  int consistent();
 };
--- a/src/variable.cpp
+++ b/src/variable.cpp
@ -29,11 +29,10 @@
 #include "output.h"
 #include "thermo.h"
 #include "random_mars.h"
+#include "math_const.h"
 #include "memory.h"
 #include "error.h"

-#include "math_const.h"
-
 using namespace LAMMPS_NS;
 using namespace MathConst;