git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7340 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-12-13 15:57:18 +00:00
parent 744efbc2cd
commit 86347b81e3
13 changed files with 1121 additions and 517 deletions
--- a/src/comm.cpp
+++ b/src/comm.cpp
@ -35,13 +35,11 @@
 #include "compute.h"
 #include "output.h"
 #include "dump.h"
 #include "procmap.h"
 #include "math_extra.h"
 #include "error.h"
 #include "memory.h"
 #include <map>
 #include <string>
 #ifdef _OPENMP
 #include "omp.h"
 #endif
@ -54,8 +52,9 @@ using namespace LAMMPS_NS;
 #define BIG 1.0e20
 enum{SINGLE,MULTI};
-enum{NONE,MULTIPLE};
+enum{MULTIPLE};                   // same as in ProcMap
-enum{CART,CARTREORDER,XYZ,XZY,YXZ,YZX,ZXY,ZYX};
+enum{ONELEVEL,NUMA,CUSTOM};
 enum{CART,CARTREORDER,XYZ};
 /* ----------------------------------------------------------------------
   setup MPI and allocate buffer space 
@ -67,9 +66,14 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
  MPI_Comm_size(world,&nprocs);
  user_procgrid[0] = user_procgrid[1] = user_procgrid[2] = 0;
  gridflag = ONELEVEL;
  mapflag = CART;
  customfile = NULL;
  recv_from_partition = send_to_partition = -1;
  otherflag = 0;
  outfile = NULL;
  grid2proc = NULL;
  layoutflag = CART;
  numa_nodes = 0;
  bordergroup = 0;
  style = SINGLE;
@ -77,8 +81,6 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
  cutghostmulti = NULL;
  cutghostuser = 0.0;
  ghost_velocity = 0;
  recv_from_partition = send_to_partition = -1;
  other_partition_style = NONE;
  // use of OpenMP threads
  // query OpenMP for number of threads/process set by user at run-time
@ -121,7 +123,10 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
 Comm::~Comm()
 {
-  if (grid2proc) memory->destroy(grid2proc);
+  delete [] customfile;
  delete [] outfile;
  memory->destroy(grid2proc);
  free_swap();
  if (style == MULTI) {
@ -138,12 +143,13 @@ Comm::~Comm()
 }
 /* ----------------------------------------------------------------------
-   create 3d grid of procs based on box size
+   create 3d grid of procs based on Nprocs and box size & shape
   map processors to grid
 ------------------------------------------------------------------------- */
 void Comm::set_proc_grid()
 {
-  // recv proc layout of another partition if my layout depends on it
+  // recv 3d proc grid of another partition if my 3d grid depends on it
  if (recv_from_partition >= 0) {
    MPI_Status status;
@ -153,124 +159,96 @@ void Comm::set_proc_grid()
    MPI_Bcast(other_procgrid,3,MPI_INT,0,world);
  }
-  // use NUMA routines if numa_nodes > 0
+  // create ProcMap class
  // if NUMA routines fail, just continue
-  if (numa_nodes) {
+  ProcMap *pmap = new ProcMap(lmp);
-    int flag = numa_set_proc_grid();
+
-    if (flag) return;
+  // create 3d grid of processors, produces procgrid
  // can fail (on one partition) if constrained by other partition
  // if numa_grid() fails, try onelevel_grid()
  int flag;
  if (gridflag == ONELEVEL) {
    flag = pmap->onelevel_grid(nprocs,user_procgrid,procgrid,
 			       otherflag,other_style,other_procgrid);
    if (!flag) error->all(FLERR,"Could not create grid of processors");
  } else if (gridflag == NUMA) {
    flag = pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
    if (!flag) {
      coregrid[0] = coregrid[1] = coregrid[2] = 1;
      flag = pmap->onelevel_grid(nprocs,user_procgrid,procgrid,
 				 otherflag,other_style,other_procgrid);
    }
    if (!flag) error->all(FLERR,"Could not create grid of processors");
  } else if (gridflag == CUSTOM) {
    pmap->custom_grid(nprocs,user_procgrid,procgrid);
  }
-  // create layout of procs mapped to simulation box
+  // error check on procgrid
  // can fail (on one partition) if constrained by other_partition_style
  int flag = 
    procs2box(nprocs,user_procgrid,procgrid,1,1,1,other_partition_style);
  if (!flag) error->all(FLERR,"Could not layout grid of processors",1);
  // error check
  if (procgrid[0]*procgrid[1]*procgrid[2] != nprocs)
    error->all(FLERR,"Bad grid of processors");
  if (domain->dimension == 2 && procgrid[2] != 1)
    error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
-  // grid2proc[i][j][k] = proc that owns i,j,k location in grid
+  // grid2proc[i][j][k] = proc that owns i,j,k location in 3d grid
  if (grid2proc) memory->destroy(grid2proc);
  memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
 		 "comm:grid2proc");
-  // use MPI Cartesian routines to layout 3d grid of procs
+  // map processor IDs to 3d processor grid
-  // MPI may do layout in machine-optimized fashion
+  // produces myloc, procneigh, grid2proc
-  if (layoutflag == CART || layoutflag == CARTREORDER) {
+  if (gridflag == ONELEVEL) {
-    int reorder = 0;
+    if (mapflag == CART)
-    if (layoutflag == CARTREORDER) reorder = 1;
+      pmap->cart_map(0,procgrid,myloc,procneigh,grid2proc);
-    int periods[3];
+    else if (mapflag == CARTREORDER)
-    periods[0] = periods[1] = periods[2] = 1;
+      pmap->cart_map(1,procgrid,myloc,procneigh,grid2proc);
-    MPI_Comm cartesian;
+    else if (mapflag == XYZ)
-      
+      pmap->xyz_map(xyz,procgrid,myloc,procneigh,grid2proc);
    MPI_Cart_create(world,3,procgrid,periods,reorder,&cartesian);
    MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
    MPI_Cart_shift(cartesian,0,1,&procneigh[0][0],&procneigh[0][1]);
    MPI_Cart_shift(cartesian,1,1,&procneigh[1][0],&procneigh[1][1]);
    MPI_Cart_shift(cartesian,2,1,&procneigh[2][0],&procneigh[2][1]);
-    int coords[3];
+  } else if (gridflag == NUMA) {
-    int i,j,k;
+    pmap->numa_map(coregrid,myloc,procneigh,grid2proc);
    for (i = 0; i < procgrid[0]; i++)
      for (j = 0; j < procgrid[1]; j++)
 	for (k = 0; k < procgrid[2]; k++) {
 	  coords[0] = i; coords[1] = j; coords[2] = k;
 	  MPI_Cart_rank(cartesian,coords,&grid2proc[i][j][k]);
 	}
    MPI_Comm_free(&cartesian);
-  // layout 3d grid of procs explicitly, via user-requested XYZ ordering
+  } else if (gridflag == CUSTOM) {
-
+    pmap->custom_map(myloc,procneigh,grid2proc);
  } else {
    int i,j,k,ime,jme,kme;
    for (i = 0; i < procgrid[0]; i++)
      for (j = 0; j < procgrid[1]; j++)
 	for (k = 0; k < procgrid[2]; k++) {
 	  if (layoutflag == XYZ)
 	    grid2proc[i][j][k] = k*procgrid[1]*procgrid[0] + j*procgrid[0] + i;
 	  else if (layoutflag == XZY)
 	    grid2proc[i][j][k] = j*procgrid[2]*procgrid[0] + k*procgrid[0] + i;
 	  else if (layoutflag == YXZ)
 	    grid2proc[i][j][k] = k*procgrid[0]*procgrid[1] + i*procgrid[1] + j;
 	  else if (layoutflag == YZX)
 	    grid2proc[i][j][k] = i*procgrid[2]*procgrid[1] + k*procgrid[1] + j;
 	  else if (layoutflag == ZXY)
 	    grid2proc[i][j][k] = j*procgrid[0]*procgrid[2] + i*procgrid[2] + k;
 	  else if (layoutflag == ZYX)
 	    grid2proc[i][j][k] = i*procgrid[1]*procgrid[2] + j*procgrid[2] + k;
 	  if (grid2proc[i][j][k] == me) {
 	    ime = i; jme = j, kme = k;
 	  }
 	}
    myloc[0] = ime;
    myloc[1] = jme;
    myloc[2] = kme;
    i = ime-1;
    if (i < 0) i = procgrid[0]-1;
    procneigh[0][0] = grid2proc[i][jme][kme];
    i = ime+1;
    if (i == procgrid[0]) i = 0;
    procneigh[0][1] = grid2proc[i][jme][kme];
    j = jme-1;
    if (j < 0) j = procgrid[1]-1;
    procneigh[1][0] = grid2proc[ime][j][kme];
    j = jme+1;
    if (j == procgrid[1]) j = 0;
    procneigh[1][1] = grid2proc[ime][j][kme];
    k = kme-1;
    if (k < 0) k = procgrid[2]-1;
    procneigh[2][0] = grid2proc[ime][jme][k];
    k = kme+1;
    if (k == procgrid[2]) k = 0;
    procneigh[2][1] = grid2proc[ime][jme][k];
  }
  // print 3d grid info to screen and logfile
  if (me == 0) {
    if (screen) {
      fprintf(screen,"  %d by %d by %d MPI processor grid\n",
 	      procgrid[0],procgrid[1],procgrid[2]);
      if (gridflag == NUMA) 
 	fprintf(screen,"  %d by %d by %d core grid within node\n",
 		coregrid[0],coregrid[1],coregrid[2]);
    }
    if (logfile) {
      fprintf(logfile,"  %d by %d by %d MPI processor grid\n",
 	      procgrid[0],procgrid[1],procgrid[2]);
      if (gridflag == NUMA) 
 	fprintf(logfile,"  %d by %d by %d core grid within node\n",
 		coregrid[0],coregrid[1],coregrid[2]);
    }
  }
  // print 3d grid details to outfile
  if (outfile) pmap->output(grid2proc,outfile);
  // set lamda box params after procs are assigned
  if (domain->triclinic) domain->set_lamda_box();
-  if (me == 0) {
+  // free ProcMap class
    if (screen) fprintf(screen,"  %d by %d by %d MPI processor grid\n",
 			procgrid[0],procgrid[1],procgrid[2]);
    if (logfile) fprintf(logfile,"  %d by %d by %d MPI processor grid\n",
 			 procgrid[0],procgrid[1],procgrid[2]);
  }
-  // send my proc layout to another partition if requested
+  delete pmap;
  // send my 3d proc grid to another partition if requested
  if (send_to_partition >= 0) {
    if (me == 0) MPI_Send(procgrid,3,MPI_INT,
@ -1196,130 +1174,6 @@ void Comm::reverse_comm_dump(Dump *dump)
  }
 }
 /* ----------------------------------------------------------------------
   assign numprocs to 3d box so as to minimize surface area
   area = surface area of each of 3 faces of simulation box divided by sx,sy,sz
   for triclinic, area = cross product of 2 edge vectors stored in h matrix
   valid assignment will be factorization of numprocs = Px by Py by Pz
   user_factors = if non-zero, factors are specified by user
   sx,sy,sz = scale box xyz dimension vy dividing by sx,sy,sz
   other = 1 to enforce compatability with other partition's layout
   return factors = # of procs assigned to each dimension
   return 1 if successully factor, 0 if not
 ------------------------------------------------------------------------- */
 int Comm::procs2box(int numprocs, int user_factors[3], int factors[3], 
 		    const int sx, const int sy, const int sz, int other)
 {
  factors[0] = user_factors[0];
  factors[1] = user_factors[1];
  factors[2] = user_factors[2];
  // all 3 proc counts are specified
  if (factors[0] && factors[1] && factors[2]) return 1;
  // 2 out of 3 proc counts are specified
  if (factors[0] > 0 && factors[1] > 0) {
    factors[2] = nprocs/(factors[0]*factors[1]);
    return 1;
  } else if (factors[0] > 0 && factors[2] > 0) {
    factors[1] = nprocs/(factors[0]*factors[2]);
    return 1;
  } else if (factors[1] > 0 && factors[2] > 0) {
    factors[0] = nprocs/(factors[1]*factors[2]);
    return 1;
  } 
  // determine cross-sectional areas for orthogonal and triclinic boxes
  // area[0] = xy, area[1] = xz, area[2] = yz
  double area[3];
  if (domain->triclinic == 0) {
    area[0] = domain->xprd * domain->yprd / (sx * sy);
    area[1] = domain->xprd * domain->zprd / (sx * sz);
    area[2] = domain->yprd * domain->zprd / (sy * sz);
  } else {
    double *h = domain->h;
    double a[3],b[3],c[3];
    a[0] = h[0]; a[1] = 0.0; a[2] = 0.0;
    b[0] = h[5]; b[1] = h[1]; b[2] = 0.0;
    MathExtra::cross3(a,b,c);
    area[0] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sx * sy);
    a[0] = h[0]; a[1] = 0.0; a[2] = 0.0;
    b[0] = h[4]; b[1] = h[3]; b[2] = h[2];
    MathExtra::cross3(a,b,c);
    area[1] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sx * sz);
    a[0] = h[5]; a[1] = h[1]; a[2] = 0.0;
    b[0] = h[4]; b[1] = h[3]; b[2] = h[2];
    MathExtra::cross3(a,b,c);
    area[2] = sqrt(c[0]*c[0] + c[1]*c[1] + c[2]*c[2]) / (sy * sz);
  }
  double bestsurf = 2.0 * (area[0]+area[1]+area[2]);
  // loop thru all possible factorizations of numprocs
  // only consider valid cases that match procgrid settings
  // surf = surface area of a proc sub-domain
  // only consider cases that match user_factors & other_procgrid settings
  // success = 1 if valid factoriztion is found
  // may not be if other constraint is enforced
  int ipx,ipy,ipz,valid;
  double surf;
  int success = 0;
  ipx = 1;
  while (ipx <= numprocs) {
    valid = 1;
    if (user_factors[0] && ipx != user_factors[0]) valid = 0;
    if (other == MULTIPLE && other_procgrid[0] % ipx) valid = 0;
    if (numprocs % ipx) valid = 0;
    if (!valid) {
      ipx++;
      continue;
    }
    ipy = 1;
    while (ipy <= numprocs/ipx) {
      valid = 1;
      if (user_factors[1] && ipy != user_factors[1]) valid = 0;
      if (other == MULTIPLE && other_procgrid[1] % ipy) valid = 0;
      if ((numprocs/ipx) % ipy) valid = 0;
      if (!valid) {
 	ipy++;
 	continue;
      }
      ipz = numprocs/ipx/ipy;
      valid = 1;
      if (user_factors[2] && ipz != user_factors[2]) valid = 0;
      if (other == MULTIPLE && other_procgrid[2] % ipz) valid = 0;
      if (domain->dimension == 2 && ipz != 1) valid = 0;
      if (!valid) {
 	ipy++;
 	continue;
      }
      surf = area[0]/ipx/ipy + area[1]/ipx/ipz + area[2]/ipy/ipz;
      if (surf < bestsurf) {
 	success = 1;
 	bestsurf = surf;
 	factors[0] = ipx;
 	factors[1] = ipy;
 	factors[2] = ipz;
      }
      ipy++;
    }
    ipx++;
  }
  return success;
 }
 /* ----------------------------------------------------------------------
   realloc the size of the send buffer as needed with BUFFACTOR & BUFEXTRA 
   if flag = 1, realloc
@ -1502,7 +1356,48 @@ void Comm::set_processors(int narg, char **arg)
  int iarg = 3;
  while (iarg < narg) {
-    if (strcmp(arg[iarg],"part") == 0) {
+    if (strcmp(arg[iarg],"grid") == 0) {
      if (iarg+1 > narg) error->all(FLERR,"Illegal processors command");
      if (strcmp(arg[iarg+1],"level1") == 0) {
 	gridflag = LEVEL1;
 	iarg += 2;
      if (strcmp(arg[iarg+1],"level2") == 0) {
 	gridflag = LEVEL2;
 	iarg += 2;
      } else if (strcmp(arg[iarg+1],"numa") == 0) {
 	gridflag = NUMA;
 	iarg += 2;
      } else if (strcmp(arg[iarg],"custom") == 0) {
 	if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
 	gridflag = CUSTOM;
 	delete [] customfile;
 	int n = strlen(arg[iarg+1]) + 1;
 	customfile = new char(n);
 	strcpy(customfile,arg[iarg+1]);
 	iarg += 2;
      } else error->all(FLERR,"Illegal processors command");
    } else if (strcmp(arg[iarg],"map") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
      if (strcmp(arg[iarg+1],"cart") == 0) mapflag = CART;
      else if (strcmp(arg[iarg+1],"cart/reorder") == 0) mapflag = CARTREORDER;
      else if (strcmp(arg[iarg+1],"xyz") == 0 ||
 	       strcmp(arg[iarg+1],"xzy") == 0 ||
 	       strcmp(arg[iarg+1],"yxz") == 0 ||
 	       strcmp(arg[iarg+1],"yzx") == 0 ||
 	       strcmp(arg[iarg+1],"zxy") == 0 ||
 	       strcmp(arg[iarg+1],"zyx") == 0) {
 	mapflag = XYZ;
 	strcpy(xyz,arg[iarg+1]);
      } else error->all(FLERR,"Illegal processors command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"part") == 0) {
      if (iarg+4 > narg) error->all(FLERR,"Illegal processors command");
      if (universe->nworlds == 1)
 	error->all(FLERR,
@ -1527,250 +1422,32 @@ void Comm::set_processors(int narg, char **arg)
 		     "is already a receiver");
 	recv_from_partition = isend-1;
      }
      // only receiver has otherflag dependency
      if (strcmp(arg[iarg+3],"multiple") == 0) {
-	if (universe->iworld == irecv-1) other_partition_style = MULTIPLE;
+	if (universe->iworld == irecv-1) {
 	  otherflag = 1;
 	  other_style = MULTIPLE;
 	}
      } else error->all(FLERR,"Illegal processors command");
      iarg += 4;
-    } else if (strcmp(arg[iarg],"grid") == 0) {
+    } else if (strcmp(arg[iarg],"out") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
-      if (strcmp(arg[iarg+1],"cart") == 0) layoutflag = CART;
+      delete [] outfile;
-      else if (strcmp(arg[iarg+1],"cart/reorder") == 0)
+      int n = strlen(arg[iarg+1]) + 1;
-	layoutflag = CARTREORDER;
+      outfile = new char(n);
-      else if (strcmp(arg[iarg+1],"xyz") == 0) layoutflag = XYZ;
+      strcpy(outfile,arg[iarg+1]);
      else if (strcmp(arg[iarg+1],"xzy") == 0) layoutflag = XZY;
      else if (strcmp(arg[iarg+1],"yxz") == 0) layoutflag = YXZ;
      else if (strcmp(arg[iarg+1],"yzx") == 0) layoutflag = YZX;
      else if (strcmp(arg[iarg+1],"zxy") == 0) layoutflag = ZXY;
      else if (strcmp(arg[iarg+1],"zyx") == 0) layoutflag = ZYX;
      else error->all(FLERR,"Illegal processors command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"numa") == 0) {
      if (iarg+1 > narg) error->all(FLERR,"Illegal processors command");
      numa_nodes = 1;
      if (numa_nodes < 0) error->all(FLERR,"Illegal processors command");
      iarg += 1;
    } else error->all(FLERR,"Illegal processors command");
  }
  // error check
-  if (numa_nodes) {
+  if (gridflag == NUMA && mapflag != CART)
-    if (layoutflag != CART) 
+    error->all(FLERR,"Processors grid numa and map choice are incompatible");
      error->all(FLERR,"Can only use processors numa "
 		 "with processors grid cart");
    if (send_to_partition >= 0 || recv_from_partition >= 0) 
      error->one(FLERR,"Cannot use processors numa with processors part");
  }
 }
 /* ----------------------------------------------------------------------
   setup 3d grid of procs based on box size, group neighbors by numa node
   return 1 if successful, return 0 if not
 ------------------------------------------------------------------------- */
 int Comm::numa_set_proc_grid()
 {
  // get names of all nodes
  int name_length;
  char node_name[MPI_MAX_PROCESSOR_NAME];
  char node_names[MPI_MAX_PROCESSOR_NAME*nprocs];
  MPI_Get_processor_name(node_name,&name_length);
  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
                MPI_MAX_PROCESSOR_NAME,MPI_CHAR,world);
  std::string node_string = std::string(node_name);
  // get number of procs per node
  std::map<std::string,int> name_map;
  std::map<std::string,int>::iterator np;
  for (int i = 0; i < nprocs; i++) {
    std::string i_string = std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]);
    np = name_map.find(i_string);
    if (np == name_map.end())
      name_map[i_string] = 1;
    else
      np->second++;
  }
  int procs_per_node = name_map.begin()->second;
  int procs_per_numa = procs_per_node / numa_nodes;
  // use non-numa mapping if any condition met
  if (procs_per_numa < 4 ||               // less than 4 procs per numa node
      procs_per_node % numa_nodes != 0 || // no-op since numa_nodes = 1 for now
      nprocs % procs_per_numa != 0 ||     // total procs not a multiple of node
      nprocs <= procs_per_numa ||         // only 1 node used
      user_procgrid[0] > 1 ||             // user specified grid dimension < 1
      user_procgrid[1] > 1 ||             //    in any dimension
      user_procgrid[2] > 1) {
    if (me == 0) {
      if (screen) fprintf(screen,"  1 by 1 by 1 Node grid\n");
      if (logfile) fprintf(logfile,"  1 by 1 by 1 Node grid\n");
    }
    return 0;
  }
  // user settings for the factorization per numa node - currently always zero
  int user_numagrid[3];
  user_numagrid[0] = user_numagrid[1] = user_numagrid[2] = 0;
  // if user specifies 1 for a proc grid dimension,
  // also use 1 for the node grid dimension
  if (user_procgrid[0] == 1) user_numagrid[0] = 1;
  if (user_procgrid[1] == 1) user_numagrid[1] = 1;
  if (user_procgrid[2] == 1) user_numagrid[2] = 1;
  // get an initial factorization for each numa node,
  // if the user has not set the number of processors
  int numagrid[3];
  procs2box(procs_per_numa,user_numagrid,numagrid,1,1,1,0);
  if (numagrid[0] * numagrid[1] * numagrid[2] != procs_per_numa)
    error->all(FLERR,"Bad node grid of processors");
  // get a factorization for the grid of numa nodes
  int node_count = nprocs / procs_per_numa;
  procs2box(node_count,user_procgrid,procgrid,
 	    numagrid[0],numagrid[1],numagrid[2],0);
  if (procgrid[0] * procgrid[1] * procgrid[2] != node_count)
    error->all(FLERR,"Bad grid of processors");
  // repeat the numa node factorization using the subdomain sizes
  // this will refine the factorization if the user specified the node layout
  procs2box(procs_per_numa,user_numagrid,numagrid,
 	    procgrid[0],procgrid[1],procgrid[2],0);
  if (numagrid[0]*numagrid[1]*numagrid[2] != procs_per_numa)
    error->all(FLERR,"Bad Node grid of processors");
  if (domain->dimension == 2 && (procgrid[2] != 1 || numagrid[2] != 1))
    error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
  // assign a unique id to each node
  int node_num = 0, node_id = 0;
  for (np = name_map.begin(); np != name_map.end(); ++np) {
    if (np->first == node_string) node_id = node_num;
    node_num++;
  }
  // setup a per node communicator and find rank within
  MPI_Comm node_comm;
  MPI_Comm_split(world,node_id,0,&node_comm);  
  int node_rank;
  MPI_Comm_rank(node_comm,&node_rank);
  // setup a per numa communicator and find rank within
  MPI_Comm numa_comm;
  int local_numa = node_rank / procs_per_numa;
  MPI_Comm_split(node_comm,local_numa,0,&numa_comm);     
  int numa_rank;
  MPI_Comm_rank(numa_comm,&numa_rank);
  // setup a communicator with the rank 0 procs from each numa node
  MPI_Comm numa_leaders;
  MPI_Comm_split(world,numa_rank,0,&numa_leaders);
  // use the MPI Cartesian routines to map the nodes to the grid
  // could implement layoutflag as in non-NUMA case?
  int reorder = 0;
  int periods[3];
  periods[0] = periods[1] = periods[2] = 1;
  MPI_Comm cartesian;
  if (numa_rank == 0) {
    MPI_Cart_create(numa_leaders,3,procgrid,periods,reorder,&cartesian);
    MPI_Cart_get(cartesian,3,procgrid,periods,myloc);
  }
  // broadcast numa node location in grid to other procs in numa node
  MPI_Bcast(myloc,3,MPI_INT,0,numa_comm);
  // get storage for the process mapping
  if (grid2proc) memory->destroy(grid2proc);
  memory->create(grid2proc,procgrid[0]*numagrid[0],procgrid[1]*numagrid[1],
                 procgrid[2]*numagrid[2],"comm:grid2proc");
  // compute my location within the grid
  int z_offset = numa_rank / (numagrid[0] * numagrid[1]);
  int y_offset = (numa_rank % (numagrid[0] * numagrid[1]))/numagrid[0];
  int x_offset = numa_rank % numagrid[0];
  myloc[0] = myloc[0] * numagrid[0] + x_offset;
  myloc[1] = myloc[1] * numagrid[1] + y_offset;
  myloc[2] = myloc[2] * numagrid[2] + z_offset;
  procgrid[0] *= numagrid[0];
  procgrid[1] *= numagrid[1];
  procgrid[2] *= numagrid[2];
  // allgather of locations to fill grid2proc
  int **gridi;
  memory->create(gridi,nprocs,3,"comm:gridi");
  MPI_Allgather(&myloc,3,MPI_INT,gridi[0],3,MPI_INT,world);
  for (int i = 0; i < nprocs; i++)
    grid2proc[gridi[i][0]][gridi[i][1]][gridi[i][2]] = i;
  memory->destroy(gridi);
  // get my neighbors
  int minus, plus;
  for (int i = 0; i < 3; i++) {
    numa_shift(myloc[i],procgrid[i],minus,plus);
    procneigh[i][0] = minus;
    procneigh[i][1] = plus;
  }
  procneigh[0][0] = grid2proc[procneigh[0][0]][myloc[1]][myloc[2]];
  procneigh[0][1] = grid2proc[procneigh[0][1]][myloc[1]][myloc[2]];
  procneigh[1][0] = grid2proc[myloc[0]][procneigh[1][0]][myloc[2]];
  procneigh[1][1] = grid2proc[myloc[0]][procneigh[1][1]][myloc[2]];
  procneigh[2][0] = grid2proc[myloc[0]][myloc[1]][procneigh[2][0]];
  procneigh[2][1] = grid2proc[myloc[0]][myloc[1]][procneigh[2][1]];
  if (numa_rank == 0) MPI_Comm_free(&cartesian);
  MPI_Comm_free(&numa_leaders);
  MPI_Comm_free(&numa_comm);
  MPI_Comm_free(&node_comm);
  // set lamda box params after procs are assigned
  if (domain->triclinic) domain->set_lamda_box();
  if (me == 0) {
    if (screen) fprintf(screen,"  %d by %d by %d Node grid\n",
 			numagrid[0],numagrid[1],numagrid[2]);
    if (logfile) fprintf(logfile,"  %d by %d by %d Node grid\n",
 			 numagrid[0],numagrid[1],numagrid[2]);
    if (screen) fprintf(screen,"  %d by %d by %d processor grid\n",
 			procgrid[0],procgrid[1],procgrid[2]);
    if (logfile) fprintf(logfile,"  %d by %d by %d processor grid\n",
 			 procgrid[0],procgrid[1],procgrid[2]);
  }
  return 1;
 }
 /* ----------------------------------------------------------------------
   get the index to the neighboring processors in a dimension
 ------------------------------------------------------------------------- */
 void Comm::numa_shift(int myloc, int num_procs, int &minus, int &plus) 
 {
  minus = myloc - 1;
  if (minus < 0) minus = num_procs - 1;
  plus = myloc + 1;
  if (plus == num_procs) plus = 0;
 }
 /* ----------------------------------------------------------------------
--- a/src/comm.h
+++ b/src/comm.h
@ -83,10 +83,16 @@ class Comm : protected Pointers {
  int comm_x_only,comm_f_only;      // 1 if only exchange x,f in for/rev comm
  int map_style;                    // non-0 if global->local mapping is done
  int bordergroup;                  // only communicate this group in borders
-  int layoutflag;                   // user-selected layout for 3d proc grid
+  int gridflag;                     // option for creating 3d grid
-  int numa_nodes;                   // layout NUMA-aware 3d proc grid
+  int mapflag;                      // option for mapping procs to 3d grid
  char xyz[4];                      // xyz mapping of procs to 3d grid
  char *customfile;                 // file with custom proc map
  char *outfile;                    // proc grid/map output file
  int otherflag;                    // 1 if this partition dependent on another
  int other_style;                  // style of dependency
  int other_procgrid[3];            // proc layout of another partition
  int coregrid[3];                  // procs layout of cores within a node
  int *firstrecv;                   // where to put 1st recv atom in each swap
  int **sendlist;                   // list of atoms to send in each swap
@ -97,8 +103,6 @@ class Comm : protected Pointers {
  int maxsend,maxrecv;              // current size of send/recv buffer
  int maxforward,maxreverse;        // max # of datums in forward/reverse comm
  int procs2box(int, int[3], int[3],        // map procs to 3d box
 		const int, const int, const int, int);
  virtual void grow_send(int,int);          // reallocate send buffer
  virtual void grow_recv(int);              // free/allocate recv buffer
  virtual void grow_list(int, int);         // reallocate one sendlist
@ -107,9 +111,6 @@ class Comm : protected Pointers {
  virtual void allocate_multi(int);         // allocate multi arrays
  virtual void free_swap();                 // free swap arrays
  virtual void free_multi();                // free multi arrays
  int numa_set_proc_grid();
  void numa_shift(int, int, int &, int &);
 };
 }
--- a/src/error.cpp
+++ b/src/error.cpp
@ -69,12 +69,10 @@ void Error::universe_one(const char *file, int line, const char *str)
   called by all procs in one world
   close all output, screen, and log files in world
   insure all procs in world call, else will hang
-   if abort = 0 (default):
+   force MPI_Abort if running in multi-partition mode
     if only one world in universe calls, universe will hang
   if abort = 1: force abort of entire universe if any world in universe calls
 ------------------------------------------------------------------------- */
-void Error::all(const char *file, int line, const char *str, int abort)
+void Error::all(const char *file, int line, const char *str)
 {
  MPI_Barrier(world);
@ -90,7 +88,7 @@ void Error::all(const char *file, int line, const char *str, int abort)
  if (screen && screen != stdout) fclose(screen);
  if (logfile) fclose(logfile);
-  if (abort) MPI_Abort(world,1);
+  if (universe->nworlds > 1) MPI_Abort(universe->uworld,1);
  MPI_Finalize();
  exit(1);
 }
--- a/src/error.h
+++ b/src/error.h
@ -25,7 +25,7 @@ class Error : protected Pointers {
  void universe_all(const char *, int, const char *);
  void universe_one(const char *, int, const char *);
-  void all(const char *, int, const char *, int = 0);
+  void all(const char *, int, const char *);
  void one(const char *, int, const char *);
  void warning(const char *, int, const char *, int = 1);
  void message(const char *, int, char *, int = 1);
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@ -149,12 +149,12 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
      iarg += 2;
    } else if (strcmp(arg[iarg],"-reorder") == 0 || 
 	       strcmp(arg[iarg],"-r") == 0) {
-      if (iarg+2 > narg) 
+      if (iarg+3 > narg) 
 	error->universe_all(FLERR,"Invalid command-line argument");
      if (universe->existflag)
 	error->universe_all(FLERR,"Cannot use -reorder after -partition");
-      universe->reorder(arg[iarg+1]);
+      universe->reorder(arg[iarg+1],arg[iarg+2]);
-      iarg += 2;
+      iarg += 3;
    } else if (strcmp(arg[iarg],"-help") == 0 || 
 	       strcmp(arg[iarg],"-h") == 0) {
      if (iarg+1 > narg) 
--- a/src/pair_born_coul_wolf.cpp
+++ b/src/pair_born_coul_wolf.cpp
@ -0,0 +1,470 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: Yongfeng Zhang (INL), yongfeng.zhang@inl.gov
 ------------------------------------------------------------------------- */
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_born_coul_wolf.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
 /* ---------------------------------------------------------------------- */
 PairBornCoulWolf::PairBornCoulWolf(LAMMPS *lmp) : Pair(lmp) {}
 /* ---------------------------------------------------------------------- */
 PairBornCoulWolf::~PairBornCoulWolf()
 {
  if (allocated) {
    memory->destroy(setflag);
    memory->destroy(cutsq);
    memory->destroy(cut_lj);
    memory->destroy(cut_ljsq);
    memory->destroy(a);
    memory->destroy(rho);
    memory->destroy(sigma);
    memory->destroy(c);
    memory->destroy(d);
    memory->destroy(rhoinv);
    memory->destroy(born1);
    memory->destroy(born2);
    memory->destroy(born3);
    memory->destroy(offset);
  }
 }
 /* ---------------------------------------------------------------------- */
 void PairBornCoulWolf::compute(int eflag, int vflag)
 {
  int i,j,ii,jj,inum,jnum,itype,jtype;
  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
  double rsq,r2inv,r6inv,forcecoul,forceborn,factor_coul,factor_lj;
  double prefactor;
  double r,rexp;
  int *ilist,*jlist,*numneigh,**firstneigh;
  double erfcc,erfcd,v_sh,dvdrr,e_self,qisq;
  evdwl = ecoul = 0.0;
  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = vflag_fdotr = 0;
  double **x = atom->x;
  double **f = atom->f;
  double *q = atom->q;
  int *type = atom->type;
  int nlocal = atom->nlocal;
  int nall = nlocal + atom->nghost;
  double *special_coul = force->special_coul;
  double *special_lj = force->special_lj;
  int newton_pair = force->newton_pair;
  double qqrd2e = force->qqrd2e;
  // self and shifted coulombic energy
  e_self = v_sh = 0.0; 
  e_shift = erfc(alf*cut_coul)/cut_coul;
  f_shift = -(e_shift+ 2.0*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) / 
    cut_coul; 
  inum = list->inum;
  ilist = list->ilist;
  numneigh = list->numneigh;
  firstneigh = list->firstneigh;
  // loop over neighbors of my atoms
  for (ii = 0; ii < inum; ii++) {
    i = ilist[ii];
    qtmp = q[i];
    xtmp = x[i][0];
    ytmp = x[i][1];
    ztmp = x[i][2];
    itype = type[i];
    jlist = firstneigh[i];
    jnum = numneigh[i];
    qisq = qtmp*qtmp;
    e_self = -(e_shift/2.0 + alf/MY_PIS) * qisq*qqrd2e;
    if (evflag) ev_tally(i,i,nlocal,0,0.0,e_self,0.0,0.0,0.0,0.0);
    for (jj = 0; jj < jnum; jj++) {
      j = jlist[jj];
      if (j < nall) factor_coul = factor_lj = 1.0;
      else {
 	factor_coul = special_coul[j/nall];
 	factor_lj = special_lj[j/nall];
 	j %= nall;
      }
      delx = xtmp - x[j][0];
      dely = ytmp - x[j][1];
      delz = ztmp - x[j][2];
      rsq = delx*delx + dely*dely + delz*delz;
      jtype = type[j];
      if (rsq < cutsq[itype][jtype]) {
 	r2inv = 1.0/rsq;
 	if (rsq < cut_coulsq) {
 	  r = sqrt(rsq);
          prefactor = qqrd2e*qtmp*q[j]/r;
          erfcc = erfc(alf*r); 
          erfcd = exp(-alf*alf*r*r);
          v_sh = (erfcc - e_shift*r) * prefactor; 
          dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
          forcecoul = dvdrr*rsq*prefactor;
 	  if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
 	} else forcecoul = 0.0;
 	if (rsq < cut_ljsq[itype][jtype]) {
 	  r6inv = r2inv*r2inv*r2inv;
          r = sqrt(rsq);
 	  rexp = exp((sigma[itype][jtype]-r)*rhoinv[itype][jtype]);
 	  forceborn = born1[itype][jtype]*r*rexp - born2[itype][jtype]*r6inv
 	    + born3[itype][jtype]*r2inv*r6inv;
 	} else forceborn = 0.0;
 	fpair = (forcecoul + factor_lj*forceborn) * r2inv;
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 	if (newton_pair || j < nlocal) {
 	  f[j][0] -= delx*fpair;
 	  f[j][1] -= dely*fpair;
 	  f[j][2] -= delz*fpair;
 	}
 	if (eflag) {
 	  if (rsq < cut_coulsq) {
 	    ecoul = v_sh;
 	    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
 	  } else ecoul = 0.0;
 	  if (rsq < cut_ljsq[itype][jtype]) {
 	    evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv +
 	      d[itype][jtype]*r6inv*r2inv - offset[itype][jtype];
 	    evdwl *= factor_lj;
 	  } else evdwl = 0.0;
 	}
 	if (evflag) ev_tally(i,j,nlocal,newton_pair,
 			     evdwl,ecoul,fpair,delx,dely,delz);
      }
    }
  }
  if (vflag_fdotr) virial_fdotr_compute();
 }
 /* ----------------------------------------------------------------------
   allocate all arrays
 ------------------------------------------------------------------------- */
 void PairBornCoulWolf::allocate()
 {
  allocated = 1;
  int n = atom->ntypes;
  memory->create(setflag,n+1,n+1,"pair:setflag");
  for (int i = 1; i <= n; i++)
    for (int j = i; j <= n; j++)
      setflag[i][j] = 0;
  memory->create(cutsq,n+1,n+1,"pair:cutsq");
  memory->create(cut_lj,n+1,n+1,"pair:cut_lj");
  memory->create(cut_ljsq,n+1,n+1,"pair:cut_ljsq");
  memory->create(a,n+1,n+1,"pair:a");
  memory->create(rho,n+1,n+1,"pair:rho");
  memory->create(sigma,n+1,n+1,"pair:sigma");
  memory->create(c,n+1,n+1,"pair:c");
  memory->create(d,n+1,n+1,"pair:d");
  memory->create(rhoinv,n+1,n+1,"pair:rhoinv");
  memory->create(born1,n+1,n+1,"pair:born1");
  memory->create(born2,n+1,n+1,"pair:born2");
  memory->create(born3,n+1,n+1,"pair:born3");
  memory->create(offset,n+1,n+1,"pair:offset");
 }
 /* ----------------------------------------------------------------------
   global settings
 ------------------------------------------------------------------------- */
 void PairBornCoulWolf::settings(int narg, char **arg)
 {
  if (narg < 2 || narg > 3) error->all(FLERR,"Illegal pair_style command");
  alf = force->numeric(arg[0]);
  cut_lj_global = force->numeric(arg[1]);
  if (narg == 2) cut_coul = cut_lj_global;
  else cut_coul = force->numeric(arg[2]);
  if (allocated) {
    int i,j;
    for (i = 1; i <= atom->ntypes; i++)
      for (j = i+1; j <= atom->ntypes; j++)
 	if (setflag[i][j]) cut_lj[i][j] = cut_lj_global;
  }
 }
 /* ----------------------------------------------------------------------
   set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 void PairBornCoulWolf::coeff(int narg, char **arg)
 {
  if (narg < 7 || narg > 8) 
    error->all(FLERR,"Incorrect args for pair coefficients");
  if (!allocated) allocate();
  int ilo,ihi,jlo,jhi;
  force->bounds(arg[0],atom->ntypes,ilo,ihi);
  force->bounds(arg[1],atom->ntypes,jlo,jhi);
  double a_one = force->numeric(arg[2]);
  double rho_one = force->numeric(arg[3]);
  double sigma_one = force->numeric(arg[4]);
  if (rho_one <= 0) error->all(FLERR,"Incorrect args for pair coefficients");
  double c_one = force->numeric(arg[5]);
  double d_one = force->numeric(arg[6]);
  double cut_lj_one = cut_lj_global;
  if (narg == 8) cut_lj_one = force->numeric(arg[7]);
  int count = 0;
  for (int i = ilo; i <= ihi; i++) {
    for (int j = MAX(jlo,i); j <= jhi; j++) {
      a[i][j] = a_one;
      rho[i][j] = rho_one;
      sigma[i][j] = sigma_one;
      c[i][j] = c_one;
      d[i][j] = d_one;
      cut_lj[i][j] = cut_lj_one;
      setflag[i][j] = 1;
      count++;
    }
  }
  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 /* ----------------------------------------------------------------------
   init specific to this pair style
 ------------------------------------------------------------------------- */
 void PairBornCoulWolf::init_style()
 {
  if (!atom->q_flag)
    error->all(FLERR,"Pair style born/coul/Wolf requires atom attribute q");
  int irequest = neighbor->request(this);
  cut_coulsq = cut_coul * cut_coul;
 }
 /* ----------------------------------------------------------------------
   init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 double PairBornCoulWolf::init_one(int i, int j)
 {
  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
  double cut = MAX(cut_lj[i][j],cut_coul);
  cut_ljsq[i][j] = cut_lj[i][j] * cut_lj[i][j];
  rhoinv[i][j] = 1.0/rho[i][j];
  born1[i][j] = a[i][j]/rho[i][j];
  born2[i][j] = 6.0*c[i][j];
  born3[i][j] = 8.0*d[i][j];
  if (offset_flag) {
    double rexp = exp(-cut_lj[i][j]/rho[i][j]);
    offset[i][j] = a[i][j]*rexp - c[i][j]/pow(cut_lj[i][j],6.0)
      + d[i][j]/pow(cut_lj[i][j],8.0);
  } else offset[i][j] = 0.0;
  cut_ljsq[j][i] = cut_ljsq[i][j];
  a[j][i] = a[i][j];
  c[j][i] = c[i][j];
  d[j][i] = d[i][j];
  rhoinv[j][i] = rhoinv[i][j];
  sigma[j][i] = sigma[i][j];
  born1[j][i] = born1[i][j];
  born2[j][i] = born2[i][j];
  born3[j][i] = born3[i][j];
  offset[j][i] = offset[i][j];
  return cut;
 }
 /* ----------------------------------------------------------------------
  proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 void PairBornCoulWolf::write_restart(FILE *fp)
 {
  write_restart_settings(fp);
  int i,j;
  for (i = 1; i <= atom->ntypes; i++)
    for (j = i; j <= atom->ntypes; j++) {
      fwrite(&setflag[i][j],sizeof(int),1,fp);
      if (setflag[i][j]) {
 	fwrite(&a[i][j],sizeof(double),1,fp);
 	fwrite(&rho[i][j],sizeof(double),1,fp);
 	fwrite(&sigma[i][j],sizeof(double),1,fp);
 	fwrite(&c[i][j],sizeof(double),1,fp);
 	fwrite(&d[i][j],sizeof(double),1,fp);
 	fwrite(&cut_lj[i][j],sizeof(double),1,fp);
      }
    }
 }
 /* ----------------------------------------------------------------------
  proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 void PairBornCoulWolf::read_restart(FILE *fp)
 {
  read_restart_settings(fp);
  allocate();
  int i,j;
  int me = comm->me;
  for (i = 1; i <= atom->ntypes; i++)
    for (j = i; j <= atom->ntypes; j++) {
      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
      if (setflag[i][j]) {
 	if (me == 0) {
 	  fread(&a[i][j],sizeof(double),1,fp);
 	  fread(&rho[i][j],sizeof(double),1,fp);
 	  fread(&sigma[i][j],sizeof(double),1,fp);
 	  fread(&c[i][j],sizeof(double),1,fp);
 	  fread(&d[i][j],sizeof(double),1,fp);
 	  fread(&cut_lj[i][j],sizeof(double),1,fp);
 	}
 	MPI_Bcast(&a[i][j],1,MPI_DOUBLE,0,world);
 	MPI_Bcast(&rho[i][j],1,MPI_DOUBLE,0,world);
 	MPI_Bcast(&sigma[i][j],1,MPI_DOUBLE,0,world);
 	MPI_Bcast(&c[i][j],1,MPI_DOUBLE,0,world);
 	MPI_Bcast(&d[i][j],1,MPI_DOUBLE,0,world);
 	MPI_Bcast(&cut_lj[i][j],1,MPI_DOUBLE,0,world);
      }
    }
 }
 /* ----------------------------------------------------------------------
  proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 void PairBornCoulWolf::write_restart_settings(FILE *fp)
 {
  fwrite(&alf,sizeof(double),1,fp);
  fwrite(&cut_lj_global,sizeof(double),1,fp);
  fwrite(&cut_coul,sizeof(double),1,fp);
  fwrite(&offset_flag,sizeof(int),1,fp);
  fwrite(&mix_flag,sizeof(int),1,fp);
 }
 /* ----------------------------------------------------------------------
  proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 void PairBornCoulWolf::read_restart_settings(FILE *fp)
 {
  if (comm->me == 0) {
    fread(&alf,sizeof(double),1,fp);
    fread(&cut_lj_global,sizeof(double),1,fp);
    fread(&cut_coul,sizeof(double),1,fp);
    fread(&offset_flag,sizeof(int),1,fp);
    fread(&mix_flag,sizeof(int),1,fp);
  }
  MPI_Bcast(&alf,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&cut_lj_global,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&cut_coul,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
 }
 /* ----------------------------------------------------------------------
   only the pair part is calculated here
 ------------------------------------------------------------------------- */
 double PairBornCoulWolf::single(int i, int j, int itype, int jtype,
 				double rsq, 
 				double factor_coul, double factor_lj,
 				double &fforce)
 {
  double r2inv,r6inv,r,prefactor,rexp;
  double forcecoul,forceborn,phicoul,phiborn;
  double e_shift,f_shift,dvdrr,erfcc,erfcd; 
  r2inv = 1.0/rsq;
  e_shift = erfc(alf*cut_coul) / cut_coul;
  f_shift = -(e_shift+2*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) / 
    cut_coul; 
  if (rsq < cut_coulsq) {
    r = sqrt(rsq);
    prefactor = force->qqrd2e * atom->q[i]*atom->q[j]/r;
    erfcc = erfc(alf*r); 
    erfcd = exp(-alf*alf*r*r);
    dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
    forcecoul = dvdrr*rsq*prefactor;
    if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
  } else forcecoul = 0.0;
  if (rsq < cut_ljsq[itype][jtype]) {
    r6inv = r2inv*r2inv*r2inv;
    r = sqrt(rsq);
    rexp = exp(-r*rhoinv[itype][jtype]);
    forceborn = born1[itype][jtype]*r*rexp - born2[itype][jtype]*r6inv
      + born3[itype][jtype]*r2inv*r6inv;
  } else forceborn = 0.0;
  fforce = (forcecoul + factor_lj*forceborn) * r2inv;
  double eng = 0.0;
  if (rsq < cut_coulsq) {
    phicoul = prefactor * (erfcc-e_shift*r);
    if (factor_coul < 1.0) phicoul -= (1.0-factor_coul)*prefactor;
    eng += phicoul;
  }
  if (rsq < cut_ljsq[itype][jtype]) {
    phiborn = a[itype][jtype]*rexp - c[itype][jtype]*r6inv 
      + d[itype][jtype]*r2inv*r6inv - offset[itype][jtype];
    eng += factor_lj*phiborn;
  }
  return eng;
 }
--- a/src/pair_born_coul_wolf.h
+++ b/src/pair_born_coul_wolf.h
@ -0,0 +1,56 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifdef PAIR_CLASS
 PairStyle(born/coul/wolf,PairBornCoulWolf)
 #else
 #ifndef LMP_PAIR_BORN_COUL_WOLF_H
 #define LMP_PAIR_BORN_COUL_WOLF_H
 #include "pair.h"
 namespace LAMMPS_NS {
 class PairBornCoulWolf : public Pair {
 public:
  PairBornCoulWolf(class LAMMPS *);
  ~PairBornCoulWolf();
  void compute(int, int);
  void settings(int, char **);
  void coeff(int, char **);
  void init_style();
  double init_one(int, int);
  void write_restart(FILE *);
  void read_restart(FILE *);
  void write_restart_settings(FILE *);
  void read_restart_settings(FILE *);
  double single(int, int, int, int, double, double, double, double &);
 private:
  double cut_lj_global;
  double **cut_lj,**cut_ljsq;
  double cut_coul,cut_coulsq;
  double **a,**rho,**sigma,**c,**d;
  double **rhoinv,**born1,**born2,**born3,**offset;
  double alf,e_shift,f_shift;
  void allocate();
 };
 }
 #endif
 #endif
--- a/src/pair_coul_wolf.cpp
+++ b/src/pair_coul_wolf.cpp
@ -0,0 +1,328 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing author: Yongfeng Zhang (INL), yongfeng.zhang@inl.gov
 ------------------------------------------------------------------------- */
 #include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
 #include "pair_coul_wolf.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
 /* ---------------------------------------------------------------------- */
 PairCoulWolf::PairCoulWolf(LAMMPS *lmp) : Pair(lmp) {}
 /* ---------------------------------------------------------------------- */
 PairCoulWolf::~PairCoulWolf()
 {
  if (allocated) {
    memory->destroy(setflag);
    memory->destroy(cutsq);
  }
 }
 /* ---------------------------------------------------------------------- */
 void PairCoulWolf::compute(int eflag, int vflag)
 {
  int i,j,ii,jj,inum,jnum,itype,jtype;
  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,ecoul,fpair;
  double rsq,forcecoul,factor_coul;
  double prefactor;
  double r,rexp;
  int *ilist,*jlist,*numneigh,**firstneigh;
  double erfcc,erfcd,v_sh,dvdrr,e_self,qisq;
  ecoul = 0.0;
  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = vflag_fdotr = 0;
  double **x = atom->x;
  double **f = atom->f;
  double *q = atom->q;
  int *type = atom->type;
  int nlocal = atom->nlocal;
  int nall = nlocal + atom->nghost;
  double *special_coul = force->special_coul;
  int newton_pair = force->newton_pair;
  double qqrd2e = force->qqrd2e;
  // self and shifted coulombic energy
  e_self = v_sh = 0.0; 
  e_shift = erfc(alf*cut_coul)/cut_coul;
  f_shift = -(e_shift+ 2.0*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) / 
    cut_coul; 
  inum = list->inum;
  ilist = list->ilist;
  numneigh = list->numneigh;
  firstneigh = list->firstneigh;
  // loop over neighbors of my atoms
  for (ii = 0; ii < inum; ii++) {
    i = ilist[ii];
    qtmp = q[i];
    xtmp = x[i][0];
    ytmp = x[i][1];
    ztmp = x[i][2];
    itype = type[i];
    jlist = firstneigh[i];
    jnum = numneigh[i];
    qisq = qtmp*qtmp;
    e_self = -(e_shift/2.0 + alf/MY_PIS) * qisq*qqrd2e;
    if (evflag) ev_tally(i,i,nlocal,0,0.0,e_self,0.0,0.0,0.0,0.0);
    for (jj = 0; jj < jnum; jj++) {
      j = jlist[jj];
      if (j < nall) factor_coul  = 1.0;
      else {
 	factor_coul = special_coul[j/nall];
 	j %= nall;
      }
      delx = xtmp - x[j][0];
      dely = ytmp - x[j][1];
      delz = ztmp - x[j][2];
      rsq = delx*delx + dely*dely + delz*delz;
      jtype = type[j];
      if (rsq < cut_coulsq) {
 	r = sqrt(rsq);
 	prefactor = qqrd2e*qtmp*q[j]/r;
 	erfcc = erfc(alf*r); 
 	erfcd = exp(-alf*alf*r*r);
 	v_sh = (erfcc - e_shift*r) * prefactor; 
 	dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
 	forcecoul = dvdrr*rsq*prefactor;
 	if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
 	fpair = forcecoul / rsq;
 	f[i][0] += delx*fpair;
 	f[i][1] += dely*fpair;
 	f[i][2] += delz*fpair;
 	if (newton_pair || j < nlocal) {
 	  f[j][0] -= delx*fpair;
 	  f[j][1] -= dely*fpair;
 	  f[j][2] -= delz*fpair;
 	}
 	if (eflag) {
 	  if (rsq < cut_coulsq) {
 	    ecoul = v_sh;
 	    if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
 	  } else ecoul = 0.0;
 	}
 	if (evflag) ev_tally(i,j,nlocal,newton_pair,
 			     0.0,ecoul,fpair,delx,dely,delz);
      }
    }
  }
  if (vflag_fdotr) virial_fdotr_compute();
 }
 /* ----------------------------------------------------------------------
   allocate all arrays
 ------------------------------------------------------------------------- */
 void PairCoulWolf::allocate()
 {
  allocated = 1;
  int n = atom->ntypes;
  memory->create(setflag,n+1,n+1,"pair:setflag");
  for (int i = 1; i <= n; i++)
    for (int j = i; j <= n; j++)
      setflag[i][j] = 0;
  memory->create(cutsq,n+1,n+1,"pair:cutsq");
 }
 /* ----------------------------------------------------------------------
   global settings
   unlike other pair styles,
     there are no individual pair settings that these override
 ------------------------------------------------------------------------- */
 void PairCoulWolf::settings(int narg, char **arg)
 {
  if (narg != 2) error->all(FLERR,"Illegal pair_style command");
  alf = force->numeric(arg[0]);
  cut_coul = force->numeric(arg[1]);
 }
 /* ----------------------------------------------------------------------
   set cutoffs for one or more type pairs, optional 
 ------------------------------------------------------------------------- */
 void PairCoulWolf::coeff(int narg, char **arg)
 {
  if (narg != 2) error->all(FLERR,"Incorrect args for pair coefficients");
  if (!allocated) allocate();
  int ilo,ihi,jlo,jhi;
  force->bounds(arg[0],atom->ntypes,ilo,ihi);
  force->bounds(arg[1],atom->ntypes,jlo,jhi);
  int count = 0;
  for (int i = ilo; i <= ihi; i++) {
    for (int j = MAX(jlo,i); j <= jhi; j++) {
      setflag[i][j] = 1;
      count++;
    }
  }
  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 /* ----------------------------------------------------------------------
   init specific to this pair style 
 ------------------------------------------------------------------------- */
 void PairCoulWolf::init_style()
 {
  if (!atom->q_flag)
    error->all(FLERR,"Pair coul/wolf requires atom attribute q");
  int irequest = neighbor->request(this);
  cut_coulsq = cut_coul*cut_coul;
 }
 /* ----------------------------------------------------------------------
   init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 double PairCoulWolf::init_one(int i, int j)
 {
  return cut_coul;
 }
 /* ----------------------------------------------------------------------
  proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 void PairCoulWolf::write_restart(FILE *fp)
 {
  write_restart_settings(fp);
  int i,j;
  for (i = 1; i <= atom->ntypes; i++)
    for (j = i; j <= atom->ntypes; j++)
      fwrite(&setflag[i][j],sizeof(int),1,fp);
 }
 /* ----------------------------------------------------------------------
  proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 void PairCoulWolf::read_restart(FILE *fp)
 {
  read_restart_settings(fp);
  allocate();
  int i,j;
  int me = comm->me;
  for (i = 1; i <= atom->ntypes; i++)
    for (j = i; j <= atom->ntypes; j++) {
      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
    }
 }
 /* ----------------------------------------------------------------------
  proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 void PairCoulWolf::write_restart_settings(FILE *fp)
 {
  fwrite(&alf,sizeof(double),1,fp);
  fwrite(&cut_coul,sizeof(double),1,fp);
  fwrite(&offset_flag,sizeof(int),1,fp);
  fwrite(&mix_flag,sizeof(int),1,fp);
 }
 /* ----------------------------------------------------------------------
   proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 void PairCoulWolf::read_restart_settings(FILE *fp)
 {
  if (comm->me == 0) {
    fread(&alf,sizeof(double),1,fp);
    fread(&cut_coul,sizeof(double),1,fp);
    fread(&offset_flag,sizeof(int),1,fp);
    fread(&mix_flag,sizeof(int),1,fp);
  }
  MPI_Bcast(&alf,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&cut_coul,1,MPI_DOUBLE,0,world);
  MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
 }
 /* ----------------------------------------------------------------------
   only the pair part is calculated here
 ------------------------------------------------------------------------- */
 double PairCoulWolf::single(int i, int j, int itype, int jtype, double rsq, 
 			    double factor_coul, double factor_lj,
 			    double &fforce)
 {
  double r6inv,r,prefactor,rexp;
  double forcecoul,forceborn,phicoul;
  double e_shift,f_shift,dvdrr,erfcc,erfcd; 
  e_shift = erfc(alf*cut_coul) / cut_coul;
  f_shift = -(e_shift+ 2.0*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) / 
    cut_coul; 
  if (rsq < cut_coulsq) {
    r = sqrt(rsq);
    prefactor = force->qqrd2e * atom->q[i]*atom->q[j]/r;
    erfcc = erfc(alf*r); 
    erfcd = exp(-alf*alf*r*r);
    dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
    forcecoul = dvdrr*rsq*prefactor;
    if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
  } else forcecoul = 0.0;
  fforce = forcecoul / rsq;
  double eng = 0.0;
  if (rsq < cut_coulsq) {
    phicoul = prefactor * (erfcc-e_shift*r);
    if (factor_coul < 1.0) phicoul -= (1.0-factor_coul)*prefactor;
    eng += phicoul;
  }
  return eng;
 }
--- a/src/pair_coul_wolf.h
+++ b/src/pair_coul_wolf.h
@ -0,0 +1,52 @@
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under 
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #ifdef PAIR_CLASS
 PairStyle(coul/wolf,PairCoulWolf)
 #else
 #ifndef LMP_PAIR_COUL_WOLF_H
 #define LMP_PAIR_COUL_WOLF_H
 #include "pair.h"
 namespace LAMMPS_NS {
 class PairCoulWolf : public Pair {
 public:
  PairCoulWolf(class LAMMPS *);
  ~PairCoulWolf();
  void compute(int, int);
  void settings(int, char **);
  void coeff(int, char **);
  void init_style();
  double init_one(int, int);
  void write_restart(FILE *);
  void read_restart(FILE *);
  void write_restart_settings(FILE *);
  void read_restart_settings(FILE *);
  double single(int, int, int, int, double, double, double, double &);
 private:
  double cut_coul,cut_coulsq;
  double alf,e_shift,f_shift; 
  void allocate();
 };
 }
 #endif
 #endif
--- a/src/pair_lj_cut.cpp
+++ b/src/pair_lj_cut.cpp
@ -431,7 +431,7 @@ void PairLJCut::settings(int narg, char **arg)
  cut_global = force->numeric(arg[0]);
  // reset cutoffs that have been explicitly set
-
+  
  if (allocated) {
    int i,j;
    for (i = 1; i <= atom->ntypes; i++)
@ -446,7 +446,8 @@ void PairLJCut::settings(int narg, char **arg)
 void PairLJCut::coeff(int narg, char **arg)
 {
-  if (narg < 4 || narg > 5) error->all(FLERR,"Incorrect args for pair coefficients");
+  if (narg < 4 || narg > 5) 
    error->all(FLERR,"Incorrect args for pair coefficients");
  if (!allocated) allocate();
  int ilo,ihi,jlo,jhi;
--- a/src/universe.cpp
+++ b/src/universe.cpp
@ -59,59 +59,81 @@ Universe::~Universe()
 }
 /* ----------------------------------------------------------------------
-   reorder universe processors based on custom file
+   reorder universe processors
   create uni2orig as inverse mapping
   re-create uworld communicator with new ordering via Comm_split()
   style = "nth", arg = N
   move every Nth proc to end of rankings
   style = "custom", arg = filename
   file has nprocs lines with I J
   I = universe proc ID in original communicator uorig
   J = universe proc ID in reordered communicator uworld
   create uni2orig as inverse mapping
   re-create uworld communicator with new ordering via Comm_split()
 ------------------------------------------------------------------------- */
-void Universe::reorder(char *file)
+void Universe::reorder(char *style, char *arg)
 {
  char line[MAXLINE];
  if (uworld != uorig) MPI_Comm_free(&uworld);
-  if (me == 0) {
+  if (strcmp(style,"nth") == 0) {
-    FILE *fp = fopen(file,"r");
+    int n = atoi(arg);
-    if (fp == NULL) error->universe_one(FLERR,"Cannot open -reorder file");
+    if (n <= 0) 
-
+      error->universe_all(FLERR,"Invalid -reorder N value");
-    // skip header = blank and comment lines
+    if (nprocs % n) 
-
+      error->universe_all(FLERR,"Nprocs not a multiple of N for -reorder");
-    char *ptr;
+    for (int i = 0; i < nprocs; i++) {
-    if (!fgets(line,MAXLINE,fp))
+      if (i < (n-1)*nprocs/n) uni2orig[i] = i/(n-1) * n + (i % (n-1));
-      error->one(FLERR,"Unexpected end of -reorder file");
+      else uni2orig[i] = (i - (n-1)*nprocs/n) * n + n-1;
    while (1) {
      if (ptr = strchr(line,'#')) *ptr = '\0';
      if (strspn(line," \t\n\r") != strlen(line)) break;
      if (!fgets(line,MAXLINE,fp))
 	error->one(FLERR,"Unexpected end of -reorder file");
    }
-    // read nprocs lines
+  } else if (strcmp(style,"custom") == 0) {
    // uni2orig = inverse mapping
-    int me_orig,me_new;
+    if (me == 0) {
-    sscanf(line,"%d %d",&me_orig,&me_new);
+      FILE *fp = fopen(arg,"r");
-    if (me_orig < 0 || me_orig >= nprocs ||
+      if (fp == NULL) error->universe_one(FLERR,"Cannot open -reorder file");
 	me_new < 0 || me_new >= nprocs)
      error->one(FLERR,"Invalid entry in reorder file");
    uni2orig[me_new] = me_orig;
-    for (int i = 1; i < nprocs; i++) {
+      // skip header = blank and comment lines
      char *ptr;
      if (!fgets(line,MAXLINE,fp))
-	error->one(FLERR,"Unexpected end of reorder file");
+	error->one(FLERR,"Unexpected end of -reorder file");
-      sscanf(line,"%ld %ld",&me_orig,&me_new);
+      while (1) {
 	if (ptr = strchr(line,'#')) *ptr = '\0';
 	if (strspn(line," \t\n\r") != strlen(line)) break;
 	if (!fgets(line,MAXLINE,fp))
 	  error->one(FLERR,"Unexpected end of -reorder file");
      }
      // read nprocs lines
      // uni2orig = inverse mapping
      int me_orig,me_new;
      sscanf(line,"%d %d",&me_orig,&me_new);
      if (me_orig < 0 || me_orig >= nprocs ||
 	  me_new < 0 || me_new >= nprocs)
 	error->one(FLERR,"Invalid entry in reorder file");
      uni2orig[me_new] = me_orig;
    }
    fclose(fp);
  }
-  MPI_Bcast(uni2orig,nprocs,MPI_INT,0,uorig);
+      for (int i = 1; i < nprocs; i++) {
 	if (!fgets(line,MAXLINE,fp))
 	  error->one(FLERR,"Unexpected end of reorder file");
 	sscanf(line,"%d %d",&me_orig,&me_new);
 	if (me_orig < 0 || me_orig >= nprocs ||
 	    me_new < 0 || me_new >= nprocs)
 	  error->one(FLERR,"Invalid entry in reorder file");
 	uni2orig[me_new] = me_orig;
      }
      fclose(fp);
    }
    // bcast uni2org from proc 0 to all other universe procs
    MPI_Bcast(uni2orig,nprocs,MPI_INT,0,uorig);
  } else error->universe_all(FLERR,"Invalid command-line argument");
  // create new uworld communicator
  int ome,key;
  MPI_Comm_rank(uorig,&ome);
--- a/src/universe.h
+++ b/src/universe.h
@ -41,7 +41,7 @@ class Universe : protected Pointers {
  Universe(class LAMMPS *, MPI_Comm);
  ~Universe();
-  void reorder(char *);
+  void reorder(char *, char *);
  void add_world(char *);
  int consistent();
 };
--- a/src/variable.cpp
+++ b/src/variable.cpp
@ -29,11 +29,10 @@
 #include "output.h"
 #include "thermo.h"
 #include "random_mars.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 #include "math_const.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;