diff --git a/doc/JPG/sinusoid.jpg b/doc/JPG/sinusoid.jpg
new file mode 100644
index 0000000000..00969281a1
Binary files /dev/null and b/doc/JPG/sinusoid.jpg differ
diff --git a/doc/JPG/sinusoid_small.jpg b/doc/JPG/sinusoid_small.jpg
new file mode 100644
index 0000000000..1643d81698
Binary files /dev/null and b/doc/JPG/sinusoid_small.jpg differ
diff --git a/doc/Manual.html b/doc/Manual.html
index 43e96bec91..fa49ba5bdf 100644
--- a/doc/Manual.html
+++ b/doc/Manual.html
@@ -1,7 +1,7 @@
 <HTML>
 <HEAD>
 <TITLE>LAMMPS-ICMS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="22 Jul 2014 version">
+<META NAME="docnumber" CONTENT="29 Jul 2014 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@@ -22,7 +22,7 @@
 
 <CENTER><H3>LAMMPS-ICMS Documentation 
 </H3></CENTER>
-<CENTER><H4>22 Jul 2014 version 
+<CENTER><H4>29 Jul 2014 version 
 </H4></CENTER>
 <H4>Version info: 
 </H4>
diff --git a/doc/Manual.txt b/doc/Manual.txt
index 28755def04..b2edaf4851 100644
--- a/doc/Manual.txt
+++ b/doc/Manual.txt
@@ -1,6 +1,6 @@
 <HEAD>
 <TITLE>LAMMPS-ICMS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="22 Jul 2014 version">
+<META NAME="docnumber" CONTENT="29 Jul 2014 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@@ -18,7 +18,7 @@
 <H1></H1>
 
 LAMMPS-ICMS Documentation :c,h3
-22 Jul 2014 version :c,h4
+29 Jul 2014 version :c,h4
 
 Version info: :h4
 
diff --git a/doc/Section_errors.txt b/doc/Section_errors.txt
index 39c73b8831..91878bc777 100644
--- a/doc/Section_errors.txt
+++ b/doc/Section_errors.txt
@@ -988,14 +988,6 @@ will assign the restart file info to actual atoms. :dd
 This is a restriction due to the way atoms are organized in a list to
 enable the atom_modify first command. :dd
 
-{Support for writing images in JPEG format not included} :dt
-
-LAMMPS was not built with the -DLAMMPS_JPEG switch in the Makefile. :dd
-
-{Support for writing images in PNG format not included} :dt
-
-LAMMPS was not built with the -DLAMMPS_PNG switch in the Makefile. :dd
-
 {Cannot dump sort on atom IDs with no atom IDs defined} :dt
 
 Self-explanatory. :dd
diff --git a/doc/create_atoms.html b/doc/create_atoms.html
index 6f06935377..eff378a418 100644
--- a/doc/create_atoms.html
+++ b/doc/create_atoms.html
@@ -31,7 +31,7 @@
 </PRE>
 <LI>zero or more keyword/value pairs may be appended 
 
-<LI>keyword = <I>mol</I> or <I>basis</I> or <I>remap</I> or <I>units</I> 
+<LI>keyword = <I>mol</I> or <I>basis</I> or <I>remap</I> or <I>var</I> or <I>set</I> or <I>units</I> 
 
 <PRE>  <I>mol</I> value = template-ID seed
     template-ID = ID of molecule template specified in a separate <A HREF = "molecule.html">molecule</A> command
@@ -40,6 +40,10 @@
     M = which basis atom
     itype = atom type (1-N) to assign to this basis atom
   <I>remap</I> value = <I>yes</I> or <I>no</I>
+  <I>var</I> value = name = variable name to evaluate for test of atom creation
+  <I>set</I> values = dim vname
+    dim = <I>x</I> or <I>y</I> or <I>z</I>
+    name = name of variable to set with x,y,z atom position
   <I>units</I> value = <I>lattice</I> or <I>box</I>
     <I>lattice</I> = the geometry is defined in lattice units
     <I>box</I> = the geometry is defined in simulation box units 
@@ -51,6 +55,7 @@
 <PRE>create_atoms 1 box
 create_atoms 3 region regsphere basis 2 3
 create_atoms 3 single 0 0 5 
+create_atoms 1 box var v set x xpos set y ypos 
 </PRE>
 <P><B>Description:</B>
 </P>
@@ -189,6 +194,45 @@ box, it will mapped back into the box, assuming the relevant
 dimensions are periodic.  If it is set to <I>no</I>, no remapping is done
 and no particle is created if its position is outside the box.
 </P>
+<P>The <I>var</I> and <I>set</I> keywords can be used to provide a criterion for
+accepting or rejecting the addition of an individual atom, based on
+its coordinates.  The <I>vname</I> specified for the <I>var</I> keyword is the
+name of an <A HREF = "variable.html">equal-style variable</A> which should evaluate
+to a zero or non-zero value based on one or two or three variables
+which will store the x, y, or z coordinates of an atom (one variable
+per coordinate).  These other variables must be <A HREF = "variable.html">equal-style
+variables</A> defined in the input script, but their
+formula can by anything.  The <I>set</I> keyword is used to identify the
+names of these other variables, one variable for the x-coordinate of a
+created atom, one for y, and one for z.
+</P>
+<P>When an atom is created, its x, y, or z coordinates override the
+formula for any <I>set</I> variable that is defined.  The <I>var</I> variable is
+then evaluated.  If the returned value is 0.0, the atom is not
+created.  If it is non-zero, the atom is created.
+</P>
+<P>As an example, these commands can be used in a 2d simulation, to
+create a sinusoidal surface.  Note that the surface is "rough" due to
+individual lattice points being "above" or "below" the mathematical
+expression for the sinusoidal curve.  If a finer lattice were used,
+the sinusoid would appear to be "smoother".  Also note the use of the
+"xlat" and "ylat" <A HREF = "thermo_style.html">thermo_style</A> keywords which
+converts lattice spacings to distance.  Click on the image for a
+larger version.
+</P>
+<PRE>variable        x equal 100
+variable        y equal 25
+lattice		hex 0.8442
+region		box block 0 $x 0 $y -0.5 0.5
+create_box	1 box 
+</PRE>
+<PRE>variable        xx equal 0.0
+variable        yy equal 0.0
+variable        v equal "(0.2*v_y*ylat * cos(v_xx/xlat * 2.0*PI*4.0/v_x) + 0.5*v_y*ylat - v_yy) > 0.0"
+create_atoms	1 box var v set x xx set y yy 
+</PRE>
+<CENTER><A HREF = "JPG/sinusoid.jpg"><IMG SRC = "JPG/sinusoid_small.jpg"></A>
+</CENTER>
 <P>The <I>units</I> keyword determines the meaning of the distance units used
 to specify the coordinates of the one particle created by the <I>single</I>
 style.  A <I>box</I> value selects standard distance units as defined by
diff --git a/doc/create_atoms.txt b/doc/create_atoms.txt
index 4552d43e9a..f395b87b32 100644
--- a/doc/create_atoms.txt
+++ b/doc/create_atoms.txt
@@ -24,7 +24,7 @@ style = {box} or {region} or {single} or {random} :l
     seed = random # seed (positive integer)
     region-ID = create atoms within this region, use NULL for entire simulation box :pre
 zero or more keyword/value pairs may be appended :l
-keyword = {mol} or {basis} or {remap} or {units} :l
+keyword = {mol} or {basis} or {remap} or {var} or {set} or {units} :l
   {mol} value = template-ID seed
     template-ID = ID of molecule template specified in a separate "molecule"_molecule.html command
     seed = random # seed (positive integer)
@@ -32,6 +32,10 @@ keyword = {mol} or {basis} or {remap} or {units} :l
     M = which basis atom
     itype = atom type (1-N) to assign to this basis atom
   {remap} value = {yes} or {no}
+  {var} value = name = variable name to evaluate for test of atom creation
+  {set} values = dim vname
+    dim = {x} or {y} or {z}
+    name = name of variable to set with x,y,z atom position
   {units} value = {lattice} or {box}
     {lattice} = the geometry is defined in lattice units
     {box} = the geometry is defined in simulation box units :pre
@@ -41,7 +45,8 @@ keyword = {mol} or {basis} or {remap} or {units} :l
 
 create_atoms 1 box
 create_atoms 3 region regsphere basis 2 3
-create_atoms 3 single 0 0 5 :pre
+create_atoms 3 single 0 0 5 
+create_atoms 1 box var v set x xpos set y ypos :pre
 
 [Description:]
 
@@ -180,6 +185,45 @@ box, it will mapped back into the box, assuming the relevant
 dimensions are periodic.  If it is set to {no}, no remapping is done
 and no particle is created if its position is outside the box.
 
+The {var} and {set} keywords can be used to provide a criterion for
+accepting or rejecting the addition of an individual atom, based on
+its coordinates.  The {vname} specified for the {var} keyword is the
+name of an "equal-style variable"_variable.html which should evaluate
+to a zero or non-zero value based on one or two or three variables
+which will store the x, y, or z coordinates of an atom (one variable
+per coordinate).  These other variables must be "equal-style
+variables"_variable.html defined in the input script, but their
+formula can by anything.  The {set} keyword is used to identify the
+names of these other variables, one variable for the x-coordinate of a
+created atom, one for y, and one for z.
+
+When an atom is created, its x, y, or z coordinates override the
+formula for any {set} variable that is defined.  The {var} variable is
+then evaluated.  If the returned value is 0.0, the atom is not
+created.  If it is non-zero, the atom is created.
+
+As an example, these commands can be used in a 2d simulation, to
+create a sinusoidal surface.  Note that the surface is "rough" due to
+individual lattice points being "above" or "below" the mathematical
+expression for the sinusoidal curve.  If a finer lattice were used,
+the sinusoid would appear to be "smoother".  Also note the use of the
+"xlat" and "ylat" "thermo_style"_thermo_style.html keywords which
+converts lattice spacings to distance.  Click on the image for a
+larger version.
+
+variable        x equal 100
+variable        y equal 25
+lattice		hex 0.8442
+region		box block 0 $x 0 $y -0.5 0.5
+create_box	1 box :pre
+
+variable        xx equal 0.0
+variable        yy equal 0.0
+variable        v equal "(0.2*v_y*ylat * cos(v_xx/xlat * 2.0*PI*4.0/v_x) + 0.5*v_y*ylat - v_yy) > 0.0"
+create_atoms	1 box var v set x xx set y yy :pre
+
+:c,image(JPG/sinusoid_small.jpg,JPG/sinusoid.jpg)
+
 The {units} keyword determines the meaning of the distance units used
 to specify the coordinates of the one particle created by the {single}
 style.  A {box} value selects standard distance units as defined by
diff --git a/doc/dump.html b/doc/dump.html
index e83d9f5c5f..c13777fd3e 100644
--- a/doc/dump.html
+++ b/doc/dump.html
@@ -57,7 +57,7 @@
       f_ID[N] = Nth column of local array calculated by a fix with ID 
 </PRE>
 <PRE>  <I>custom</I> of <I>custom/mpiio</I> args = list of atom attributes
-    possible attributes = id, mol, type, element, mass,
+    possible attributes = id, mol, proc, procp1, type, element, mass,
 			  x, y, z, xs, ys, zs, xu, yu, zu, 
 			  xsu, ysu, zsu, ix, iy, iz,
 			  vx, vy, vz, fx, fy, fz,
@@ -69,6 +69,7 @@
 <PRE>      id = atom ID
       mol = molecule ID
       proc = ID of processor that owns atom
+      procp1 = ID+1 of processor that owns atom
       type = atom type
       element = name of atom element, as defined by <A HREF = "dump_modify.html">dump_modify</A> command
       mass = atom mass
@@ -460,18 +461,20 @@ dump 1 all local 1000 tmp.dump index c_1[1] c_1[2] c_1[3] c_2[1] c_2[2]
 <P>This section explains the atom attributes that can be specified as
 part of the <I>custom</I> and <I>cfg</I> styles.
 </P>
-<P>The <I>id</I>, <I>mol</I>, <I>proc</I>, <I>type</I>, <I>element</I>, <I>mass</I>, <I>vx</I>, <I>vy</I>, <I>vz</I>,
-<I>fx</I>, <I>fy</I>, <I>fz</I>, <I>q</I> attributes are self-explanatory.
+<P>The <I>id</I>, <I>mol</I>, <I>proc</I>, <I>procp1</I>, <I>type</I>, <I>element</I>, <I>mass</I>, <I>vx</I>,
+<I>vy</I>, <I>vz</I>, <I>fx</I>, <I>fy</I>, <I>fz</I>, <I>q</I> attributes are self-explanatory.
 </P>
 <P><I>Id</I> is the atom ID.  <I>Mol</I> is the molecule ID, included in the data
 file for molecular systems.  <I>Proc</I> is the ID of the processor (0 to
-Nprocs-1) that currently owns the atom.  <I>Type</I> is the atom type.
-<I>Element</I> is typically the chemical name of an element, which you must
-assign to each type via the <A HREF = "dump_modify.html">dump_modify element</A>
-command.  More generally, it can be any string you wish to associated
-with an atom type.  <I>Mass</I> is the atom mass.  <I>Vx</I>, <I>vy</I>, <I>vz</I>, <I>fx</I>,
-<I>fy</I>, <I>fz</I>, and <I>q</I> are components of atom velocity and force and
-atomic charge.
+Nprocs-1) that currently owns the atom.  <I>Procp1</I> is the proc ID+1,
+which can be convenient in place of a <I>type</I> attribute (1 to Ntypes)
+for coloring atoms in a visualization program.  <I>Type</I> is the atom
+type (1 to Ntypes).  <I>Element</I> is typically the chemical name of an
+element, which you must assign to each type via the <A HREF = "dump_modify.html">dump_modify
+element</A> command.  More generally, it can be any
+string you wish to associated with an atom type.  <I>Mass</I> is the atom
+mass.  <I>Vx</I>, <I>vy</I>, <I>vz</I>, <I>fx</I>, <I>fy</I>, <I>fz</I>, and <I>q</I> are components of
+atom velocity and force and atomic charge.
 </P>
 <P>There are several options for outputting atom coordinates.  The <I>x</I>,
 <I>y</I>, <I>z</I> attributes write atom coordinates "unscaled", in the
diff --git a/doc/dump.txt b/doc/dump.txt
index 3f83f217ca..da12e99ed7 100644
--- a/doc/dump.txt
+++ b/doc/dump.txt
@@ -44,7 +44,7 @@ args = list of arguments for a particular style :l
       f_ID\[N\] = Nth column of local array calculated by a fix with ID :pre
 
   {custom} of {custom/mpiio} args = list of atom attributes
-    possible attributes = id, mol, type, element, mass,
+    possible attributes = id, mol, proc, procp1, type, element, mass,
 			  x, y, z, xs, ys, zs, xu, yu, zu, 
 			  xsu, ysu, zsu, ix, iy, iz,
 			  vx, vy, vz, fx, fy, fz,
@@ -56,6 +56,7 @@ args = list of arguments for a particular style :l
       id = atom ID
       mol = molecule ID
       proc = ID of processor that owns atom
+      procp1 = ID+1 of processor that owns atom
       type = atom type
       element = name of atom element, as defined by "dump_modify"_dump_modify.html command
       mass = atom mass
@@ -78,8 +79,8 @@ args = list of arguments for a particular style :l
       f_ID = per-atom vector calculated by a fix with ID
       f_ID\[N\] = Nth column of per-atom array calculated by a fix with ID
       v_name = per-atom vector calculated by an atom-style variable with name
-      d_name = per-atom floating point vector with name managed by fix property/atom
-      i_name = per-atom integer vector with name managed by fix property/atom :pre
+      d_name = per-atom floating point vector with name, managed by fix property/atom
+      i_name = per-atom integer vector with name, managed by fix property/atom :pre
 :ule
 
 [Examples:]
@@ -112,7 +113,7 @@ options; see details below.
 
 As described below, the filename determines the kind of output (text
 or binary or gzipped, one big file or one per timestep, one big file
-or one per processor).
+or multiple smaller files).
 
 IMPORTANT NOTE: Because periodic boundary conditions are enforced only
 on timesteps when neighbor lists are rebuilt, the coordinates of an
@@ -127,7 +128,7 @@ if the "atom_modify sort"_atom_modify.html option is on, which it is
 by default.  In this case atoms are re-ordered periodically during a
 simulation, due to spatial sorting.  It is also true when running in
 parallel, because data for a single snapshot is collected from
-multiple processors.
+multiple processors, each of which owns a subset of the atoms.
 
 For the {atom}, {custom}, {cfg}, and {local} styles, sorting is off by
 default.  For the {dcd}, {xtc}, {xyz}, and {molfile} styles, sorting by
@@ -295,11 +296,12 @@ from using the (numerical) atom type to an element name (or some
 other label). This will help many visualization programs to guess
 bonds and colors.
 
-Note that {atom}, {custom}, {dcd}, {xtc}, and {xyz} style dump files can
-be read directly by "VMD"_http://www.ks.uiuc.edu/Research/vmd (a popular
-molecular viewing program).  See "Section tools"_Section_tools.html#vmd
-of the manual and the tools/lmp2vmd/README.txt file for more information
-about support in VMD for reading and visualizing LAMMPS dump files.
+Note that {atom}, {custom}, {dcd}, {xtc}, and {xyz} style dump files
+can be read directly by "VMD"_http://www.ks.uiuc.edu/Research/vmd, a
+popular molecular viewing program.  See "Section
+tools"_Section_tools.html#vmd of the manual and the
+tools/lmp2vmd/README.txt file for more information about support in
+VMD for reading and visualizing LAMMPS dump files.
 
 :line
 
@@ -332,7 +334,7 @@ tmp.dump.20000, etc.  This option is not available for the {dcd} and
 {xtc} styles.  Note that the "dump_modify pad"_dump_modify.html
 command can be used to insure all timestep numbers are the same length
 (e.g. 00010), which can make it easier to read a series of dump files
-in order by some post-processing tools.
+in order with some post-processing tools.
 
 If a "%" character appears in the filename, then each of P processors
 writes a portion of the dump file, and the "%" character is replaced
@@ -447,18 +449,20 @@ dump 1 all local 1000 tmp.dump index c_1\[1\] c_1\[2\] c_1\[3\] c_2\[1\] c_2\[2\
 This section explains the atom attributes that can be specified as
 part of the {custom} and {cfg} styles.
 
-The {id}, {mol}, {proc}, {type}, {element}, {mass}, {vx}, {vy}, {vz},
-{fx}, {fy}, {fz}, {q} attributes are self-explanatory.
+The {id}, {mol}, {proc}, {procp1}, {type}, {element}, {mass}, {vx},
+{vy}, {vz}, {fx}, {fy}, {fz}, {q} attributes are self-explanatory.
 
 {Id} is the atom ID.  {Mol} is the molecule ID, included in the data
 file for molecular systems.  {Proc} is the ID of the processor (0 to
-Nprocs-1) that currently owns the atom.  {Type} is the atom type.
-{Element} is typically the chemical name of an element, which you must
-assign to each type via the "dump_modify element"_dump_modify.html
-command.  More generally, it can be any string you wish to associated
-with an atom type.  {Mass} is the atom mass.  {Vx}, {vy}, {vz}, {fx},
-{fy}, {fz}, and {q} are components of atom velocity and force and
-atomic charge.
+Nprocs-1) that currently owns the atom.  {Procp1} is the proc ID+1,
+which can be convenient in place of a {type} attribute (1 to Ntypes)
+for coloring atoms in a visualization program.  {Type} is the atom
+type (1 to Ntypes).  {Element} is typically the chemical name of an
+element, which you must assign to each type via the "dump_modify
+element"_dump_modify.html command.  More generally, it can be any
+string you wish to associated with an atom type.  {Mass} is the atom
+mass.  {Vx}, {vy}, {vz}, {fx}, {fy}, {fz}, and {q} are components of
+atom velocity and force and atomic charge.
 
 There are several options for outputting atom coordinates.  The {x},
 {y}, {z} attributes write atom coordinates "unscaled", in the
diff --git a/doc/fix_store_state.txt b/doc/fix_store_state.txt
index e40a0f49cc..01f73a3bbe 100644
--- a/doc/fix_store_state.txt
+++ b/doc/fix_store_state.txt
@@ -46,8 +46,8 @@ input = one or more atom attributes :l
       f_ID = per-atom vector calculated by a fix with ID
       f_ID\[I\] = Ith column of per-atom array calculated by a fix with ID
       v_name = per-atom vector calculated by an atom-style variable with name
-      d_name = per-atom floating point vector managed by fix property/atom
-      i_name = per-atom integer vector managed by fix property/atom :pre
+      d_name = per-atom floating point vector name, managed by fix property/atom
+      i_name = per-atom integer vector name, managed by fix property/atom :pre
 
 zero or more keyword/value pairs may be appended :l
 keyword = {com} :l
diff --git a/doc/pair_table.html b/doc/pair_table.html
index 21d9fc9d1f..1919b40002 100644
--- a/doc/pair_table.html
+++ b/doc/pair_table.html
@@ -119,8 +119,8 @@ to very steep parts of the potential.
 </UL>
 <HR>
 
-<P>The format of a tabulated file is as follows (without the
-parenthesized comments):
+<P>The format of a tabulated file is a series of one or more sections,
+defined as follows (without the parenthesized comments):
 </P>
 <PRE># Morse potential for Fe   (one or more comment or blank lines) 
 </PRE>
diff --git a/doc/pair_table.txt b/doc/pair_table.txt
index 4b83d2a8f5..8938de20b7 100644
--- a/doc/pair_table.txt
+++ b/doc/pair_table.txt
@@ -113,8 +113,8 @@ to very steep parts of the potential. :l,ule
 
 :line
 
-The format of a tabulated file is as follows (without the
-parenthesized comments):
+The format of a tabulated file is a series of one or more sections,
+defined as follows (without the parenthesized comments):
 
 # Morse potential for Fe   (one or more comment or blank lines) :pre
 
diff --git a/doc/variable.html b/doc/variable.html
index 51daea0bf2..5222302cbe 100644
--- a/doc/variable.html
+++ b/doc/variable.html
@@ -50,7 +50,7 @@
     constants = PI
     thermo keywords = vol, ke, press, etc from <A HREF = "thermo_style.html">thermo_style</A>
     math operators = (), -x, x+y, x-y, x*y, x/y, x^y, x%y,
-                     x==y, x!=y, x<y, x<=y, x>y, x>=y, x&&y, x||y, !x
+                     x == y, x != y, x &lt y, x &lt= y, x &gt y, x &gt= y, x && y, x || y, !x
     math functions = sqrt(x), exp(x), ln(x), log(x), abs(x),
                      sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x),
                      random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x)
@@ -368,7 +368,8 @@ references to other variables.
 <TR><TD >Number</TD><TD > 0.2, 100, 1.0e20, -15.4, etc</TD></TR>
 <TR><TD >Constant</TD><TD > PI</TD></TR>
 <TR><TD >Thermo keywords</TD><TD > vol, pe, ebond, etc</TD></TR>
-<TR><TD >Math operators</TD><TD > (), -x, x+y, x-y, x*y, x/y, x^y, x%y, x==y, x!=y, x<y, x<=y, x>y, x>=y, x&&y, x||y, !x</TD></TR>
+<TR><TD >Math operators</TD><TD > (), -x, x+y, x-y, x*y, x/y, x^y, x%y, </TD></TR>
+<TR><TD >Math operators</TD><TD > (), -x, x+y, x-y, x*y, x/y, x^y, x%y, x == y, x != y, x &lt y, x &lt= y, x &gt y, x &gt= y, x && y, x || y, !x</TD></TR>
 <TR><TD >Math functions</TD><TD > sqrt(x), exp(x), ln(x), log(x), abs(x), sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x), random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x), ramp(x,y), stagger(x,y), logfreq(x,y,z), stride(x,y,z), vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z)</TD></TR>
 <TR><TD >Group functions</TD><TD > count(ID), mass(ID), charge(ID), xcm(ID,dim),                  vcm(ID,dim), fcm(ID,dim), bound(ID,dir), 		 gyration(ID), ke(ID), angmom(ID,dim), torque(ID,dim), 		 inertia(ID,dimdim), omega(ID,dim)</TD></TR>
 <TR><TD >Region functions</TD><TD > count(ID,IDR), mass(ID,IDR), charge(ID,IDR), 		  xcm(ID,dim,IDR), vcm(ID,dim,IDR), fcm(ID,dim,IDR), 		  bound(ID,dir,IDR), gyration(ID,IDR), ke(ID,IDR), 		  angmom(ID,dim,IDR), torque(ID,dim,IDR),                   inertia(ID,dimdim,IDR), omega(ID,dim,IDR)</TD></TR>
diff --git a/doc/variable.txt b/doc/variable.txt
index e9362de392..d7323b3efe 100644
--- a/doc/variable.txt
+++ b/doc/variable.txt
@@ -45,7 +45,7 @@ style = {delete} or {index} or {loop} or {world} or {universe} or {uloop} or {st
     constants = PI
     thermo keywords = vol, ke, press, etc from "thermo_style"_thermo_style.html
     math operators = (), -x, x+y, x-y, x*y, x/y, x^y, x%y,
-                     x==y, x!=y, x<y, x<=y, x>y, x>=y, x&&y, x||y, !x
+                     x == y, x != y, x &lt y, x &lt= y, x &gt y, x &gt= y, x && y, x || y, !x
     math functions = sqrt(x), exp(x), ln(x), log(x), abs(x),
                      sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x),
                      random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x)
@@ -361,7 +361,8 @@ references to other variables.
 Number: 0.2, 100, 1.0e20, -15.4, etc
 Constant: PI
 Thermo keywords: vol, pe, ebond, etc
-Math operators: (), -x, x+y, x-y, x*y, x/y, x^y, x%y, x==y, x!=y, x<y, x<=y, x>y, x>=y, x&&y, x||y, !x
+Math operators: (), -x, x+y, x-y, x*y, x/y, x^y, x%y, 
+Math operators: (), -x, x+y, x-y, x*y, x/y, x^y, x%y, x == y, x != y, x &lt y, x &lt= y, x &gt y, x &gt= y, x && y, x || y, !x
 Math functions: sqrt(x), exp(x), ln(x), log(x), abs(x), sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x), random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x), ramp(x,y), stagger(x,y), logfreq(x,y,z), stride(x,y,z), vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z)
 Group functions: count(ID), mass(ID), charge(ID), xcm(ID,dim), \
                  vcm(ID,dim), fcm(ID,dim), bound(ID,dir), \
diff --git a/src/GRANULAR/fix_pour.cpp b/src/GRANULAR/fix_pour.cpp
index 73a13e2192..9947d9a84a 100644
--- a/src/GRANULAR/fix_pour.cpp
+++ b/src/GRANULAR/fix_pour.cpp
@@ -37,10 +37,11 @@ using namespace LAMMPS_NS;
 using namespace FixConst;
 using namespace MathConst;
 
-#define EPSILON 0.001
-
 enum{ATOM,MOLECULE};
 enum{ONE,RANGE,POLY};
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
+
+#define EPSILON 0.001
 
 /* ---------------------------------------------------------------------- */
 
@@ -598,13 +599,25 @@ void FixPour::pre_exchange()
       if (newcoord[0] >= sublo[0] && newcoord[0] < subhi[0] &&
           newcoord[1] >= sublo[1] && newcoord[1] < subhi[1] &&
           newcoord[2] >= sublo[2] && newcoord[2] < subhi[2]) flag = 1;
-      else if (dimension == 3 && newcoord[2] >= domain->boxhi[2] &&
-               comm->myloc[2] == comm->procgrid[2]-1 &&
-               newcoord[0] >= sublo[0] && newcoord[0] < subhi[0] &&
-               newcoord[1] >= sublo[1] && newcoord[1] < subhi[1]) flag = 1;
-      else if (dimension == 2 && newcoord[1] >= domain->boxhi[1] &&
-               comm->myloc[1] == comm->procgrid[1]-1 &&
-               newcoord[0] >= sublo[0] && newcoord[0] < subhi[0]) flag = 1;
+      else if (dimension == 3 && newcoord[2] >= domain->boxhi[2]) {
+        if (comm->layout != LAYOUT_TILED) {
+          if (comm->myloc[2] == comm->procgrid[2]-1 &&
+              newcoord[0] >= sublo[0] && newcoord[0] < subhi[0] &&
+              newcoord[1] >= sublo[1] && newcoord[1] < subhi[1]) flag = 1;
+        } else {
+          if (comm->mysplit[2][1] == 1.0 &&
+              newcoord[0] >= sublo[0] && newcoord[0] < subhi[0] &&
+              newcoord[1] >= sublo[1] && newcoord[1] < subhi[1]) flag = 1;
+        }
+      } else if (dimension == 2 && newcoord[1] >= domain->boxhi[1]) {
+        if (comm->layout != LAYOUT_TILED) {
+          if (comm->myloc[1] == comm->procgrid[1]-1 &&
+              newcoord[0] >= sublo[0] && newcoord[0] < subhi[0]) flag = 1;
+        } else {
+          if (comm->mysplit[1][1] == 1.0 &&
+              newcoord[0] >= sublo[0] && newcoord[0] < subhi[0]) flag = 1;
+        }
+      }
 
       if (flag) {
         if (mode == ATOM) atom->avec->create_atom(ntype,coords[m]);
diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp
index 5ce73ee41f..81dbc98b47 100644
--- a/src/KSPACE/msm.cpp
+++ b/src/KSPACE/msm.cpp
@@ -148,6 +148,9 @@ void MSM::init()
   triclinic_check();
   if (domain->dimension == 2)
     error->all(FLERR,"Cannot (yet) use MSM with 2d simulation");
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"MSM can only currently be used with "
+                        "comm_style brick");
 
   if (!atom->q_flag) error->all(FLERR,"Kspace style requires atom attribute q");
 
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index 4d45ef2227..ec61833f76 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -183,6 +183,9 @@ void PPPM::init()
                "slab correction");
   if (domain->dimension == 2) error->all(FLERR,
                                          "Cannot use PPPM with 2d simulation");
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"PPPM can only currently be used with "
+                        "comm_style brick");
 
   if (!atom->q_flag) error->all(FLERR,"Kspace style requires atom attribute q");
 
diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp
index 195aa47af3..1dfea3bf77 100644
--- a/src/KSPACE/pppm_disp.cpp
+++ b/src/KSPACE/pppm_disp.cpp
@@ -213,6 +213,9 @@ void PPPMDisp::init()
   triclinic_check();
   if (domain->dimension == 2)
     error->all(FLERR,"Cannot use PPPMDisp with 2d simulation");
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"PPPMDisp can only currently be used with "
+                        "comm_style brick");
 
   if (slabflag == 0 && domain->nonperiodic > 0)
     error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDisp");
diff --git a/src/MISC/fix_deposit.cpp b/src/MISC/fix_deposit.cpp
index a804028397..213d39329b 100644
--- a/src/MISC/fix_deposit.cpp
+++ b/src/MISC/fix_deposit.cpp
@@ -37,6 +37,7 @@ using namespace FixConst;
 using namespace MathConst;
 
 enum{ATOM,MOLECULE};
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 #define EPSILON 1.0e6
 
@@ -452,13 +453,25 @@ void FixDeposit::pre_exchange()
       if (newcoord[0] >= sublo[0] && newcoord[0] < subhi[0] &&
           newcoord[1] >= sublo[1] && newcoord[1] < subhi[1] &&
           newcoord[2] >= sublo[2] && newcoord[2] < subhi[2]) flag = 1;
-      else if (dimension == 3 && newcoord[2] >= domain->boxhi[2] &&
-               comm->myloc[2] == comm->procgrid[2]-1 &&
-               newcoord[0] >= sublo[0] && newcoord[0] < subhi[0] &&
-               newcoord[1] >= sublo[1] && newcoord[1] < subhi[1]) flag = 1;
-      else if (dimension == 2 && newcoord[1] >= domain->boxhi[1] &&
-               comm->myloc[1] == comm->procgrid[1]-1 &&
-               newcoord[0] >= sublo[0] && newcoord[0] < subhi[0]) flag = 1;
+      else if (dimension == 3 && newcoord[2] >= domain->boxhi[2]) {
+        if (comm->layout != LAYOUT_TILED) {
+          if (comm->myloc[2] == comm->procgrid[2]-1 &&
+              newcoord[0] >= sublo[0] && newcoord[0] < subhi[0] &&
+              newcoord[1] >= sublo[1] && newcoord[1] < subhi[1]) flag = 1;
+        } else {
+          if (comm->mysplit[2][1] == 1.0 &&
+              newcoord[0] >= sublo[0] && newcoord[0] < subhi[0] &&
+              newcoord[1] >= sublo[1] && newcoord[1] < subhi[1]) flag = 1;
+        } 
+      } else if (dimension == 2 && newcoord[1] >= domain->boxhi[1]) {
+        if (comm->layout != LAYOUT_TILED) {
+          if (comm->myloc[1] == comm->procgrid[1]-1 &&
+              newcoord[0] >= sublo[0] && newcoord[0] < subhi[0]) flag = 1;
+        } else {
+          if (comm->mysplit[1][1] == 1.0 &&
+              newcoord[0] >= sublo[0] && newcoord[0] < subhi[0]) flag = 1;
+        }
+      }
 
       if (flag) {
         if (mode == ATOM) atom->avec->create_atom(ntype,coords[m]);
diff --git a/src/REPLICA/verlet_split.cpp b/src/REPLICA/verlet_split.cpp
index ac45b4f56d..734b070541 100644
--- a/src/REPLICA/verlet_split.cpp
+++ b/src/REPLICA/verlet_split.cpp
@@ -52,6 +52,9 @@ VerletSplit::VerletSplit(LAMMPS *lmp, int narg, char **arg) :
   if (universe->procs_per_world[0] % universe->procs_per_world[1])
     error->universe_all(FLERR,"Verlet/split requires Rspace partition "
                         "size be multiple of Kspace partition size");
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Verlet/split can only currently be used with "
+                        "comm_style brick");
 
   // master = 1 for Rspace procs, 0 for Kspace procs
 
@@ -214,6 +217,9 @@ VerletSplit::~VerletSplit()
 
 void VerletSplit::init()
 {
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Verlet/split can only currently be used with "
+                        "comm_style brick");
   if (!force->kspace && comm->me == 0)
     error->warning(FLERR,"No Kspace calculation with verlet/split");
 
diff --git a/src/SHOCK/fix_append_atoms.cpp b/src/SHOCK/fix_append_atoms.cpp
index 5e7af01f6e..571e77b129 100644
--- a/src/SHOCK/fix_append_atoms.cpp
+++ b/src/SHOCK/fix_append_atoms.cpp
@@ -29,6 +29,8 @@
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
+
 #define BIG      1.0e30
 #define EPSILON  1.0e-6
 
@@ -410,7 +412,15 @@ void FixAppendAtoms::pre_exchange()
 
   if (ntimestep % freq == 0) {
     if (spatflag==1) if (get_spatial()==0) return;
-    if (comm->myloc[2] == comm->procgrid[2]-1) {
+
+    int addflag = 0;
+    if (comm->layout != LAYOUT_TILED) {
+      if (comm->myloc[2] == comm->procgrid[2]-1) addflag = 1;
+    } else {
+      if (comm->mysplit[2][1] == 1.0) addflag = 1;
+    }
+
+    if (addflag) {
       double bboxlo[3],bboxhi[3];
 
       bboxlo[0] = domain->sublo[0]; bboxhi[0] = domain->subhi[0];
diff --git a/src/SRD/fix_srd.cpp b/src/SRD/fix_srd.cpp
index 3db73bab51..37dc068797 100644
--- a/src/SRD/fix_srd.cpp
+++ b/src/SRD/fix_srd.cpp
@@ -334,9 +334,12 @@ void FixSRD::init()
   if (bigexist && comm->ghost_velocity == 0)
     error->all(FLERR,"Fix srd requires ghost atoms store velocity");
   if (bigexist && collidestyle == NOSLIP && !atom->torque_flag)
-    error->all(FLERR,"Fix SRD no-slip requires atom attribute torque");
+    error->all(FLERR,"Fix srd no-slip requires atom attribute torque");
   if (initflag && update->dt != dt_big)
     error->all(FLERR,"Cannot change timestep once fix srd is setup");
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Fix srd can only currently be used with "
+                        "comm_style brick");
 
   // orthogonal vs triclinic simulation box
   // could be static or shearing box
diff --git a/src/STUBS/mpi.c b/src/STUBS/mpi.c
index 718e5fb475..987479bb7b 100644
--- a/src/STUBS/mpi.c
+++ b/src/STUBS/mpi.c
@@ -20,7 +20,7 @@
 #include <sys/time.h>
 #include "mpi.h"
 
-/* lo-level data structure */
+/* data structure for double/int */
 
 struct _mpi_double_int {
   double value;
@@ -28,6 +28,15 @@ struct _mpi_double_int {
 };
 typedef struct _mpi_double_int double_int;
 
+/* extra MPI_Datatypes registered by MPI_Type_contiguous */
+
+#define MAXEXTRA_DATATYPE 16
+
+int nextra_datatype;
+MPI_Datatype *ptr_datatype[MAXEXTRA_DATATYPE];
+int index_datatype[MAXEXTRA_DATATYPE];
+int size_datatype[MAXEXTRA_DATATYPE];
+
 static int _mpi_is_initialized=0;
 
 /* ---------------------------------------------------------------------- */
@@ -135,6 +144,8 @@ double MPI_Wtime()
 
 /* ---------------------------------------------------------------------- */
 
+/* include sizes of user defined datatypes, stored in extra lists */
+
 static int stubtypesize(MPI_Datatype datatype)
 {
   if (datatype == MPI_INT)             return sizeof(int);
@@ -145,6 +156,12 @@ static int stubtypesize(MPI_Datatype datatype)
   else if (datatype == MPI_LONG)       return sizeof(long);
   else if (datatype == MPI_LONG_LONG)  return sizeof(uint64_t);
   else if (datatype == MPI_DOUBLE_INT) return sizeof(double_int);
+  else {
+    int i;
+    for (i = 0; i < nextra_datatype; i++)
+      if (datatype == index_datatype[i]) return size_datatype[i];
+  }
+  return 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -339,6 +356,66 @@ int MPI_Cart_rank(MPI_Comm comm, int *coords, int *rank)
 
 /* ---------------------------------------------------------------------- */
 
+/* store size of user datatype in extra lists */
+
+int MPI_Type_contiguous(int count, MPI_Datatype oldtype, 
+                        MPI_Datatype *newtype)
+{
+  if (nextra_datatype = MAXEXTRA_DATATYPE) return -1;
+  ptr_datatype[nextra_datatype] = newtype;
+  index_datatype[nextra_datatype] = -(nextra_datatype + 1);
+  size_datatype[nextra_datatype] = count * stubtypesize(oldtype);
+  nextra_datatype++;
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* set value of user datatype to internal negative index, 
+   based on match of ptr */
+
+int MPI_Type_commit(MPI_Datatype *datatype)
+{
+  int i;
+  for (i = 0; i < nextra_datatype; i++)
+    if (datatype == ptr_datatype[i]) *datatype = index_datatype[i];
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* remove user datatype from extra lists */
+
+int MPI_Type_free(MPI_Datatype *datatype)
+{
+  int i;
+  for (i = 0; i < nextra_datatype; i++)
+    if (datatype == ptr_datatype[i]) {
+      ptr_datatype[i] = ptr_datatype[nextra_datatype-1];
+      index_datatype[i] = index_datatype[nextra_datatype-1];
+      size_datatype[i] = size_datatype[nextra_datatype-1];
+      nextra_datatype--;
+      break;
+    }
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int MPI_Op_create(MPI_User_function *function, int commute, MPI_Op *op)
+{
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int MPI_Op_free(MPI_Op *op)
+{
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
 int MPI_Barrier(MPI_Comm comm) {return 0;}
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/STUBS/mpi.h b/src/STUBS/mpi.h
index 0d93326b45..07ea1027eb 100644
--- a/src/STUBS/mpi.h
+++ b/src/STUBS/mpi.h
@@ -64,6 +64,9 @@ extern "C" {
 
 #define MPI_MAX_PROCESSOR_NAME 128
 
+typedef void MPI_User_function(void *invec, void *inoutvec, 
+                               int *len, MPI_Datatype *datatype);
+
 /* MPI data structs */
 
 struct _MPI_Status {
@@ -123,6 +126,13 @@ int MPI_Cart_shift(MPI_Comm comm, int direction, int displ,
                    int *source, int *dest);
 int MPI_Cart_rank(MPI_Comm comm, int *coords, int *rank);
 
+int MPI_Type_contiguous(int count, MPI_Datatype oldtype, 
+                        MPI_Datatype *newtype);
+int MPI_Type_commit(MPI_Datatype *datatype);
+int MPI_Type_free(MPI_Datatype *datatype);
+
+int MPI_Op_create(MPI_User_function *function, int commute, MPI_Op *op);
+int MPI_Op_free(MPI_Op *op);
 
 int MPI_Barrier(MPI_Comm comm);
 int MPI_Bcast(void *buf, int count, MPI_Datatype datatype,
diff --git a/src/USER-CUDA/pppm_cuda.cpp b/src/USER-CUDA/pppm_cuda.cpp
index 9754dd1f94..2a2973c900 100644
--- a/src/USER-CUDA/pppm_cuda.cpp
+++ b/src/USER-CUDA/pppm_cuda.cpp
@@ -211,6 +211,9 @@ void PPPMCuda::init()
   // error check
 
   if (domain->dimension == 2) error->all(FLERR,"Cannot use PPPMCuda with 2d simulation");
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"PPPMCuda can only currently be used with "
+                        "comm_style brick");
 
   if (!atom->q_flag) error->all(FLERR,"Kspace style requires atom attribute q");
 
diff --git a/src/USER-LB/fix_lb_fluid.cpp b/src/USER-LB/fix_lb_fluid.cpp
index 0161fd658d..4706d576e4 100644
--- a/src/USER-LB/fix_lb_fluid.cpp
+++ b/src/USER-LB/fix_lb_fluid.cpp
@@ -87,6 +87,10 @@ FixLbFluid::FixLbFluid(LAMMPS *lmp, int narg, char **arg) :
 
   if(narg <7) error->all(FLERR,"Illegal fix lb/fluid command");
 
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Fix lb/fluid can only currently be used with "
+                        "comm_style brick");
+
   MPI_Comm_rank(world,&me);
   MPI_Comm_size(world,&nprocs);
 
@@ -566,9 +570,12 @@ int FixLbFluid::setmask()
 
 void FixLbFluid::init(void)
 {
-  
   int i,j;
 
+  if (comm->style != 0) 
+    error->universe_all(FLERR,"Fix lb/fluid can only currently be used with "
+                        "comm_style brick");
+
   //--------------------------------------------------------------------------
   // Check to see if the MD timestep has changed between runs.
   //--------------------------------------------------------------------------
diff --git a/src/atom.cpp b/src/atom.cpp
index ff2b15f4a0..78e317299c 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -47,6 +47,8 @@ using namespace MathConst;
 #define CUDA_CHUNK 3000
 #define MAXBODY 20       // max # of lines in one body, also in ReadData class
 
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
+
 /* ---------------------------------------------------------------------- */
 
 Atom::Atom(LAMMPS *lmp) : Pointers(lmp)
@@ -756,17 +758,33 @@ void Atom::data_atoms(int n, char *buf)
     sublo[2] = domain->sublo_lamda[2]; subhi[2] = domain->subhi_lamda[2];
   }
 
-  if (domain->xperiodic) {
-    if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
-    if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
-  }
-  if (domain->yperiodic) {
-    if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
-    if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
-  }
-  if (domain->zperiodic) {
-    if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
-    if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
+  if (comm->layout != LAYOUT_TILED) {
+    if (domain->xperiodic) {
+      if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
+      if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
+    }
+    if (domain->yperiodic) {
+      if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
+      if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
+    }
+    if (domain->zperiodic) {
+      if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
+      if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
+    }
+
+  } else {
+    if (domain->xperiodic) {
+      if (comm->mysplit[0][0] == 0.0) sublo[0] -= epsilon[0];
+      if (comm->mysplit[0][1] == 1.0) subhi[0] += epsilon[0];
+    }
+    if (domain->yperiodic) {
+      if (comm->mysplit[1][0] == 0.0) sublo[1] -= epsilon[1];
+      if (comm->mysplit[1][1] == 1.0) subhi[1] += epsilon[1];
+    }
+    if (domain->zperiodic) {
+      if (comm->mysplit[2][0] == 0.0) sublo[2] -= epsilon[2];
+      if (comm->mysplit[2][1] == 1.0) subhi[2] += epsilon[2];
+    }
   }
 
   // xptr = which word in line starts xyz coords
diff --git a/src/balance.cpp b/src/balance.cpp
index a76a7ce364..8675d8568e 100644
--- a/src/balance.cpp
+++ b/src/balance.cpp
@@ -21,6 +21,7 @@
 #include "balance.h"
 #include "atom.h"
 #include "comm.h"
+#include "rcb.h"
 #include "irregular.h"
 #include "domain.h"
 #include "force.h"
@@ -30,9 +31,10 @@
 
 using namespace LAMMPS_NS;
 
-enum{XYZ,SHIFT,RCB};
+enum{XYZ,SHIFT,BISECTION};
 enum{NONE,UNIFORM,USER};
 enum{X,Y,Z};
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 /* ---------------------------------------------------------------------- */
 
@@ -47,6 +49,8 @@ Balance::Balance(LAMMPS *lmp) : Pointers(lmp)
   user_xsplit = user_ysplit = user_zsplit = NULL;
   shift_allocate = 0;
 
+  rcb = NULL;
+
   fp = NULL;
   firststep = 1;
 }
@@ -74,6 +78,8 @@ Balance::~Balance()
     delete [] hisum;
   }
 
+  delete rcb;
+
   if (fp) fclose(fp);
 }
 
@@ -176,7 +182,7 @@ void Balance::command(int narg, char **arg)
 
     } else if (strcmp(arg[iarg],"rcb") == 0) {
       if (style != -1) error->all(FLERR,"Illegal balance command");
-      style = RCB;
+      style = BISECTION;
       iarg++;
 
     } else break;
@@ -232,6 +238,9 @@ void Balance::command(int narg, char **arg)
     }
   }
 
+  if (style == BISECTION && comm->style == 0) 
+    error->all(FLERR,"Balance rcb cannot be used with comm_style brick");
+
   // insure atoms are in current box & update box via shrink-wrap
   // init entire system since comm->setup is done
   // comm::init needs neighbor::init needs pair::init needs kspace::init, etc
@@ -251,8 +260,10 @@ void Balance::command(int narg, char **arg)
   double imbinit = imbalance_nlocal(maxinit);
 
   // no load-balance if imbalance doesn't exceed threshhold
+  // unless switching from tiled to non tiled layout, then force rebalance
 
-  if (imbinit < thresh) return;
+  if (comm->layout == LAYOUT_TILED && style != BISECTION) {
+  } else if (imbinit < thresh) return;
 
   // debug output of initial state
 
@@ -262,80 +273,65 @@ void Balance::command(int narg, char **arg)
 
   int niter = 0;
 
-  // NOTE: if using XYZ or SHIFT and current partition is TILING,
-  //   then need to create initial BRICK partition before performing LB
-
   // perform load-balance
   // style XYZ = explicit setting of cutting planes of logical 3d grid
 
   if (style == XYZ) {
+    if (comm->layout == LAYOUT_UNIFORM) {
+      if (xflag == USER || yflag == USER || zflag == USER)
+        comm->layout = LAYOUT_NONUNIFORM;
+    } else if (comm->style == LAYOUT_NONUNIFORM) {
+      if (xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
+        comm->layout = LAYOUT_UNIFORM;
+    } else if (comm->style == LAYOUT_TILED) {
+      if (xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
+        comm->layout = LAYOUT_UNIFORM;
+      else comm->layout = LAYOUT_NONUNIFORM;
+    }
+
     if (xflag == UNIFORM) {
       for (int i = 0; i < procgrid[0]; i++)
         comm->xsplit[i] = i * 1.0/procgrid[0];
       comm->xsplit[procgrid[0]] = 1.0;
-    }
+    } else if (xflag == USER)
+      for (int i = 0; i <= procgrid[0]; i++) comm->xsplit[i] = user_xsplit[i];
 
     if (yflag == UNIFORM) {
       for (int i = 0; i < procgrid[1]; i++)
         comm->ysplit[i] = i * 1.0/procgrid[1];
       comm->ysplit[procgrid[1]] = 1.0;
-    }
+    } else if (yflag == USER)
+      for (int i = 0; i <= procgrid[1]; i++) comm->ysplit[i] = user_ysplit[i];
 
     if (zflag == UNIFORM) {
       for (int i = 0; i < procgrid[2]; i++)
         comm->zsplit[i] = i * 1.0/procgrid[2];
       comm->zsplit[procgrid[2]] = 1.0;
-    }
-
-    if (xflag == USER)
-      for (int i = 0; i <= procgrid[0]; i++) comm->xsplit[i] = user_xsplit[i];
-
-    if (yflag == USER)
-      for (int i = 0; i <= procgrid[1]; i++) comm->ysplit[i] = user_ysplit[i];
-
-    if (zflag == USER)
+    } else if (zflag == USER)
       for (int i = 0; i <= procgrid[2]; i++) comm->zsplit[i] = user_zsplit[i];
   }
 
   // style SHIFT = adjust cutting planes of logical 3d grid
 
   if (style == SHIFT) {
-    static_setup(bstr);
+    comm->layout = LAYOUT_NONUNIFORM;
+    shift_setup_static(bstr);
     niter = shift();
   }
 
-  // style RCB = 
+  // style BISECTION = recursive coordinate bisectioning
 
-  if (style == RCB) {
-    error->all(FLERR,"Balance rcb is not yet supported");
-
-    if (comm->style == 0) 
-      error->all(FLERR,"Cannot use balance rcb with comm_style brick");
+  if (style == BISECTION) {
+    comm->layout = LAYOUT_TILED;
+    bisection(1);
   }
 
   // output of final result
 
   if (outflag && me == 0) dumpout(update->ntimestep,fp);
 
-  // reset comm->uniform flag if necessary
-
-  if (comm->uniform) {
-    if (style == SHIFT) comm->uniform = 0;
-    if (style == XYZ && xflag == USER) comm->uniform = 0;
-    if (style == XYZ && yflag == USER) comm->uniform = 0;
-    if (style == XYZ && zflag == USER) comm->uniform = 0;
-  } else {
-    if (dimension == 3) {
-      if (style == XYZ && 
-          xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
-        comm->uniform = 1;
-    } else {
-      if (style == XYZ && xflag == UNIFORM && yflag == UNIFORM)
-        comm->uniform = 1;
-    }
-  }
-
   // reset proc sub-domains
+  // for either brick or tiled comm style
 
   if (domain->triclinic) domain->set_lamda_box();
   domain->set_local_box();
@@ -344,7 +340,8 @@ void Balance::command(int narg, char **arg)
 
   if (domain->triclinic) domain->x2lamda(atom->nlocal);
   Irregular *irregular = new Irregular(lmp);
-  irregular->migrate_atoms(1);
+  if (style == BISECTION) irregular->migrate_atoms(1,rcb->sendproc);
+  else irregular->migrate_atoms(1);
   delete irregular;
   if (domain->triclinic) domain->lamda2x(atom->nlocal);
 
@@ -382,34 +379,36 @@ void Balance::command(int narg, char **arg)
     }
   }
 
-  if (me == 0) {
-    if (screen) {
-      fprintf(screen,"  x cuts:");
-      for (int i = 0; i <= comm->procgrid[0]; i++)
-        fprintf(screen," %g",comm->xsplit[i]);
-      fprintf(screen,"\n");
-      fprintf(screen,"  y cuts:");
-      for (int i = 0; i <= comm->procgrid[1]; i++)
-        fprintf(screen," %g",comm->ysplit[i]);
-      fprintf(screen,"\n");
-      fprintf(screen,"  z cuts:");
-      for (int i = 0; i <= comm->procgrid[2]; i++)
-        fprintf(screen," %g",comm->zsplit[i]);
-      fprintf(screen,"\n");
-    }
-    if (logfile) {
-      fprintf(logfile,"  x cuts:");
-      for (int i = 0; i <= comm->procgrid[0]; i++)
-        fprintf(logfile," %g",comm->xsplit[i]);
-      fprintf(logfile,"\n");
-      fprintf(logfile,"  y cuts:");
-      for (int i = 0; i <= comm->procgrid[1]; i++)
-        fprintf(logfile," %g",comm->ysplit[i]);
-      fprintf(logfile,"\n");
-      fprintf(logfile,"  z cuts:");
-      for (int i = 0; i <= comm->procgrid[2]; i++)
-        fprintf(logfile," %g",comm->zsplit[i]);
-      fprintf(logfile,"\n");
+  if (style != BISECTION) {
+    if (me == 0) {
+      if (screen) {
+        fprintf(screen,"  x cuts:");
+        for (int i = 0; i <= comm->procgrid[0]; i++)
+          fprintf(screen," %g",comm->xsplit[i]);
+        fprintf(screen,"\n");
+        fprintf(screen,"  y cuts:");
+        for (int i = 0; i <= comm->procgrid[1]; i++)
+          fprintf(screen," %g",comm->ysplit[i]);
+        fprintf(screen,"\n");
+        fprintf(screen,"  z cuts:");
+        for (int i = 0; i <= comm->procgrid[2]; i++)
+          fprintf(screen," %g",comm->zsplit[i]);
+        fprintf(screen,"\n");
+      }
+      if (logfile) {
+        fprintf(logfile,"  x cuts:");
+        for (int i = 0; i <= comm->procgrid[0]; i++)
+          fprintf(logfile," %g",comm->xsplit[i]);
+        fprintf(logfile,"\n");
+        fprintf(logfile,"  y cuts:");
+        for (int i = 0; i <= comm->procgrid[1]; i++)
+          fprintf(logfile," %g",comm->ysplit[i]);
+        fprintf(logfile,"\n");
+        fprintf(logfile,"  z cuts:");
+        for (int i = 0; i <= comm->procgrid[2]; i++)
+          fprintf(logfile," %g",comm->zsplit[i]);
+        fprintf(logfile,"\n");
+      }
     }
   }
 }
@@ -467,13 +466,59 @@ double Balance::imbalance_splits(int &max)
   return imbalance;
 }
 
+/* ----------------------------------------------------------------------
+   perform balancing via RCB class
+   sortflag = flag for sorting order of received messages by proc ID
+------------------------------------------------------------------------- */
+
+int *Balance::bisection(int sortflag)
+{
+  if (!rcb) rcb = new RCB(lmp);
+
+  // NOTE: lo/hi args could be simulation box or particle bounding box
+  //       if particle bbox, then mysplit needs to be reset to sim box
+  // NOTE: triclinic needs to be in lamda coords
+
+  int dim = domain->dimension;
+  double *boxlo = domain->boxlo;
+  double *boxhi = domain->boxhi;
+  double *prd = domain->prd;
+
+  rcb->compute(dim,atom->nlocal,atom->x,NULL,boxlo,boxhi);
+  rcb->invert(sortflag);
+  
+  // NOTE: this logic is specific to orthogonal boxes, not triclinic
+
+  comm->rcbnew = 1;
+  comm->rcbcut = rcb->cut;
+  comm->rcbcutdim = rcb->cutdim;
+
+  double (*mysplit)[2] = comm->mysplit;
+  
+  mysplit[0][0] = (rcb->lo[0] - boxlo[0]) / prd[0];
+  if (rcb->hi[0] == boxhi[0]) mysplit[0][1] = 1.0;
+  else mysplit[0][1] = (rcb->hi[0] - boxlo[0]) / prd[0];
+  
+  mysplit[1][0] = (rcb->lo[1] - boxlo[1]) / prd[1];
+  if (rcb->hi[1] == boxhi[1]) mysplit[1][1] = 1.0;
+  else mysplit[1][1] = (rcb->hi[1] - boxlo[1]) / prd[1];
+  
+  mysplit[2][0] = (rcb->lo[2] - boxlo[2]) / prd[2];
+  if (rcb->hi[2] == boxhi[2]) mysplit[2][1] = 1.0;
+  else mysplit[2][1] = (rcb->hi[2] - boxlo[2]) / prd[2];
+
+  // return list of procs to send my atoms to
+
+  return rcb->sendproc;
+}
+
 /* ----------------------------------------------------------------------
    setup static load balance operations
-   called from command
+   called from command and indirectly initially from fix balance
    set rho = 0 for static balancing
 ------------------------------------------------------------------------- */
 
-void Balance::static_setup(char *str)
+void Balance::shift_setup_static(char *str)
 {
   shift_allocate = 1;
 
@@ -498,21 +543,35 @@ void Balance::static_setup(char *str)
   losum = new bigint[max+1];
   hisum = new bigint[max+1];
 
+  // if current layout is TILED, set initial uniform splits in Comm
+  // this gives starting point to subsequent shift balancing
+
+  if (comm->layout == LAYOUT_TILED) {
+    int *procgrid = comm->procgrid;
+    double *xsplit = comm->xsplit;
+    double *ysplit = comm->ysplit;
+    double *zsplit = comm->zsplit;
+
+    for (int i = 0; i < procgrid[0]; i++) xsplit[i] = i * 1.0/procgrid[0];
+    for (int i = 0; i < procgrid[1]; i++) ysplit[i] = i * 1.0/procgrid[1];
+    for (int i = 0; i < procgrid[2]; i++) zsplit[i] = i * 1.0/procgrid[2];
+    xsplit[procgrid[0]] = ysplit[procgrid[1]] = zsplit[procgrid[2]] = 1.0;
+  }
+
   rho = 0;
 }
 
 /* ----------------------------------------------------------------------
    setup shift load balance operations
    called from fix balance
-   set rho = 1 for shift balancing after call to shift_setup()
+   set rho = 1 to do dynamic balancing after call to shift_setup_static()
 ------------------------------------------------------------------------- */
 
 void Balance::shift_setup(char *str, int nitermax_in, double thresh_in)
 {
-  static_setup(str);
+  shift_setup_static(str);
   nitermax = nitermax_in;
   stopthresh = thresh_in;
-
   rho = 1;
 }
 
@@ -525,7 +584,7 @@ void Balance::shift_setup(char *str, int nitermax_in, double thresh_in)
 int Balance::shift()
 {
   int i,j,k,m,np,max;
-  double *split = NULL;
+  double *split;
 
   // no balancing if no atoms
 
@@ -590,7 +649,7 @@ int Balance::shift()
     // iterate until balanced
 
 #ifdef BALANCE_DEBUG
-    if (me == 0) debug_output(idim,0,np,split);
+    if (me == 0) debug_shift_output(idim,0,np,split);
 #endif
 
     int doneflag;
@@ -601,7 +660,7 @@ int Balance::shift()
       niter++;
 
 #ifdef BALANCE_DEBUG
-      if (me == 0) debug_output(idim,m+1,np,split);
+      if (me == 0) debug_shift_output(idim,m+1,np,split);
       if (me == 0 && fp) dumpout(update->ntimestep,fp);
 #endif
 
@@ -827,7 +886,7 @@ void Balance::dumpout(bigint tstep, FILE *bfp)
 
     int nx = comm->procgrid[0] + 1;
     int ny = comm->procgrid[1] + 1;
-    //int nz = comm->procgrid[2] + 1;
+    int nz = comm->procgrid[2] + 1;
 
     if (dimension == 2) {
       int m = 0;
@@ -914,7 +973,7 @@ void Balance::dumpout(bigint tstep, FILE *bfp)
 ------------------------------------------------------------------------- */
 
 #ifdef BALANCE_DEBUG
-void Balance::debug_output(int idim, int m, int np, double *split)
+void Balance::debug_shift_output(int idim, int m, int np, double *split)
 {
   int i;
   const char *dim = NULL;
diff --git a/src/balance.h b/src/balance.h
index 66ad6db721..69ff41abaf 100644
--- a/src/balance.h
+++ b/src/balance.h
@@ -27,11 +27,14 @@ namespace LAMMPS_NS {
 
 class Balance : protected Pointers {
  public:
+  class RCB *rcb;
+
   Balance(class LAMMPS *);
   ~Balance();
   void command(int, char **);
   void shift_setup(char *, int, double);
   int shift();
+  int *bisection(int sortflag = 0);
   double imbalance_nlocal(int &);
   void dumpout(bigint, FILE *);
 
@@ -66,13 +69,13 @@ class Balance : protected Pointers {
   FILE *fp;
   int firststep;
 
-  void static_setup(char *);
   double imbalance_splits(int &);
+  void shift_setup_static(char *);
   void tally(int, int, double *);
   int adjust(int, double *);
   int binary(double, int, double *);
 #ifdef BALANCE_DEBUG
-  void debug_output(int, int, int, double *);
+  void debug_shift_output(int, int, int, double *);
 #endif
 };
 
diff --git a/src/comm.cpp b/src/comm.cpp
index a80db36d95..ad4b18786e 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -12,6 +12,7 @@
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
+#include "stdlib.h"
 #include "string.h"
 #include "comm.h"
 #include "universe.h"
@@ -20,9 +21,14 @@
 #include "domain.h"
 #include "group.h"
 #include "procmap.h"
+#include "accelerator_kokkos.h"
 #include "memory.h"
 #include "error.h"
 
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+
 using namespace LAMMPS_NS;
 
 enum{SINGLE,MULTI};             // same as in Comm sub-styles
@@ -47,25 +53,96 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
   gridflag = ONELEVEL;
   mapflag = CART;
   customfile = NULL;
+  outfile = NULL;
   recv_from_partition = send_to_partition = -1;
   otherflag = 0;
-  outfile = NULL;
+  maxexchange_atom = maxexchange_fix = 0;
 
+  grid2proc = NULL;
   xsplit = ysplit = zsplit = NULL;
+  rcbnew = 0;
+
+  // use of OpenMP threads
+  // query OpenMP for number of threads/process set by user at run-time
+  // if the OMP_NUM_THREADS environment variable is not set, we default
+  // to using 1 thread. This follows the principle of the least surprise,
+  // while practically all OpenMP implementations violate it by using
+  // as many threads as there are (virtual) CPU cores by default.
+
+  nthreads = 1;
+#ifdef _OPENMP
+  if (lmp->kokkos) {
+    nthreads = lmp->kokkos->num_threads * lmp->kokkos->numa;
+  } else if (getenv("OMP_NUM_THREADS") == NULL) {
+    nthreads = 1;
+    if (me == 0)
+      error->warning(FLERR,"OMP_NUM_THREADS environment is not set.");
+  } else {
+    nthreads = omp_get_max_threads();
+  }
+
+  // enforce consistent number of threads across all MPI tasks
+
+  MPI_Bcast(&nthreads,1,MPI_INT,0,world);
+  if (!lmp->kokkos) omp_set_num_threads(nthreads);
+
+  if (me == 0) {
+    if (screen)
+      fprintf(screen,"  using %d OpenMP thread(s) per MPI task\n",nthreads);
+    if (logfile)
+      fprintf(logfile,"  using %d OpenMP thread(s) per MPI task\n",nthreads);
+  }
+#endif
+
 }
 
 /* ---------------------------------------------------------------------- */
 
 Comm::~Comm()
 {
+  memory->destroy(grid2proc);
   memory->destroy(xsplit);
   memory->destroy(ysplit);
   memory->destroy(zsplit);
-
   delete [] customfile;
   delete [] outfile;
 }
 
+/* ----------------------------------------------------------------------
+   deep copy of arrays from old Comm class to new one
+   all public/protected vectors/arrays in parent Comm class must be copied
+   called from alternate constructor of child classes
+   when new comm style is created from Input
+------------------------------------------------------------------------- */
+
+void Comm::copy_arrays(Comm *oldcomm)
+{
+  if (oldcomm->grid2proc) {
+    memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
+                   "comm:grid2proc");
+    memcpy(&grid2proc[0][0][0],&oldcomm->grid2proc[0][0][0],
+           (procgrid[0]*procgrid[1]*procgrid[2])*sizeof(int));
+    
+    memory->create(xsplit,procgrid[0]+1,"comm:xsplit");
+    memory->create(ysplit,procgrid[1]+1,"comm:ysplit");
+    memory->create(zsplit,procgrid[2]+1,"comm:zsplit");
+    memcpy(xsplit,oldcomm->xsplit,(procgrid[0]+1)*sizeof(double));
+    memcpy(ysplit,oldcomm->ysplit,(procgrid[1]+1)*sizeof(double));
+    memcpy(zsplit,oldcomm->zsplit,(procgrid[2]+1)*sizeof(double));
+  }
+
+  if (customfile) {
+    int n = strlen(oldcomm->customfile) + 1;
+    customfile = new char[n];
+    strcpy(customfile,oldcomm->customfile);
+  }
+  if (outfile) {
+    int n = strlen(oldcomm->outfile) + 1;
+    outfile = new char[n];
+    strcpy(outfile,oldcomm->outfile);
+  }
+}
+
 /* ----------------------------------------------------------------------
    modify communication params
    invoked from input script by comm_modify command
diff --git a/src/comm.h b/src/comm.h
index e50def8c53..353ea131d1 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -21,21 +21,15 @@ namespace LAMMPS_NS {
 class Comm : protected Pointers {
  public:
   int style;     // comm pattern: 0 = 6-way stencil, 1 = irregular tiling
-  int layout;    // current proc domains: 0 = logical bricks, 1 = general tiling
-                 // can do style=1 on layout=0, but not vice versa
-  // NOTE: uniform needs to be subsumed into layout
-  int uniform;                      // 1 = equal subdomains, 0 = load-balanced
+  int layout;    // LAYOUT_UNIFORM = logical equal-sized bricks
+                 // LAYOUT_NONUNIFORM = logical bricks, 
+                 //                     but different sizes due to LB
+                 // LAYOUT_TILED = general tiling, due to RCB LB
 
   int me,nprocs;                    // proc info
-  int procgrid[3];                  // procs assigned in each dim of 3d grid
-  int user_procgrid[3];             // user request for procs in each dim
-  int myloc[3];                     // which proc I am in each dim
-  int procneigh[3][2];              // my 6 neighboring procs, 0/1 = left/right
   int ghost_velocity;               // 1 if ghost atoms have velocity, 0 if not
-  double *xsplit,*ysplit,*zsplit;   // fractional (0-1) sub-domain sizes
   double cutghost[3];               // cutoffs used for acquiring ghost atoms
   double cutghostuser;              // user-specified ghost cutoff
-  int ***grid2proc;                 // which proc owns i,j,k loc in 3d grid
   int recv_from_partition;          // recv proc layout from this partition
   int send_to_partition;            // send my proc layout to this partition
                                     // -1 if no recv or send
@@ -45,8 +39,27 @@ class Comm : protected Pointers {
   int maxexchange_fix;              // max contribution to exchange from Fixes
   int nthreads;                     // OpenMP threads per MPI process
 
+  // public settings specific to layout = UNIFORM, NONUNIFORM
+
+  int procgrid[3];                  // procs assigned in each dim of 3d grid
+  int user_procgrid[3];             // user request for procs in each dim
+  int myloc[3];                     // which proc I am in each dim
+  int procneigh[3][2];              // my 6 neighboring procs, 0/1 = left/right
+  double *xsplit,*ysplit,*zsplit;   // fractional (0-1) sub-domain sizes
+  int ***grid2proc;                 // which proc owns i,j,k loc in 3d grid
+
+  // public settings specific to layout = TILED
+
+  int rcbnew;                       // 1 if just reset by rebalance, else 0
+  double mysplit[3][2];             // fractional (0-1) bounds of my sub-domain
+  double rcbcut;                    // RCB cut by this proc
+  int rcbcutdim;                    // dimension of RCB cut
+
+  // methods
+
   Comm(class LAMMPS *);
   virtual ~Comm();
+  void copy_arrays(class Comm *);
   void modify_params(int, char **);
 
   void set_processors(int, char **);      // set 3d processor grid attributes
diff --git a/src/comm_brick.cpp b/src/comm_brick.cpp
index 93a81c3ad2..70997725b0 100644
--- a/src/comm_brick.cpp
+++ b/src/comm_brick.cpp
@@ -22,6 +22,7 @@
 #include "stdio.h"
 #include "stdlib.h"
 #include "comm_brick.h"
+#include "comm_tiled.h"
 #include "universe.h"
 #include "atom.h"
 #include "atom_vec.h"
@@ -35,15 +36,10 @@
 #include "compute.h"
 #include "output.h"
 #include "dump.h"
-#include "accelerator_kokkos.h"
 #include "math_extra.h"
 #include "error.h"
 #include "memory.h"
 
-#ifdef _OPENMP
-#include "omp.h"
-#endif
-
 using namespace LAMMPS_NS;
 
 #define BUFFACTOR 1.5
@@ -52,58 +48,57 @@ using namespace LAMMPS_NS;
 #define BIG 1.0e20
 
 enum{SINGLE,MULTI};               // same as in Comm
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 /* ---------------------------------------------------------------------- */
 
 CommBrick::CommBrick(LAMMPS *lmp) : Comm(lmp)
 {
-  style = layout = 0;
+  style = 0;
+  layout = LAYOUT_UNIFORM;
+  init_buffers();
+}
 
-  recv_from_partition = send_to_partition = -1;
-  otherflag = 0;
+/* ---------------------------------------------------------------------- */
 
-  grid2proc = NULL;
+CommBrick::~CommBrick()
+{
+  free_swap();
+  if (mode == MULTI) {
+    free_multi();
+    memory->destroy(cutghostmulti);
+  }
 
-  uniform = 1;
+  if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
+  memory->sfree(sendlist);
+  memory->destroy(maxsendlist);
+
+  memory->destroy(buf_send);
+  memory->destroy(buf_recv);
+}
+
+/* ---------------------------------------------------------------------- */
+
+CommBrick::CommBrick(LAMMPS *lmp, Comm *oldcomm) : Comm(*oldcomm)
+{
+  if (oldcomm->layout == LAYOUT_TILED)
+    error->all(FLERR,"Cannot change to comm_style brick from tiled layout");
+
+  style = 0;
+  layout = oldcomm->layout;
+  copy_arrays(oldcomm);
+  init_buffers();
+}
+
+/* ----------------------------------------------------------------------
+   initialize comm buffers and other data structs local to CommBrick
+------------------------------------------------------------------------- */
+
+void CommBrick::init_buffers()
+{
   multilo = multihi = NULL;
   cutghostmulti = NULL;
 
-  // use of OpenMP threads
-  // query OpenMP for number of threads/process set by user at run-time
-  // if the OMP_NUM_THREADS environment variable is not set, we default
-  // to using 1 thread. This follows the principle of the least surprise,
-  // while practically all OpenMP implementations violate it by using
-  // as many threads as there are (virtual) CPU cores by default.
-
-  nthreads = 1;
-#ifdef _OPENMP
-  if (lmp->kokkos) {
-    nthreads = lmp->kokkos->num_threads * lmp->kokkos->numa;
-  } else if (getenv("OMP_NUM_THREADS") == NULL) {
-    nthreads = 1;
-    if (me == 0)
-      error->warning(FLERR,"OMP_NUM_THREADS environment is not set.");
-  } else {
-    nthreads = omp_get_max_threads();
-  }
-
-  // enforce consistent number of threads across all MPI tasks
-
-  MPI_Bcast(&nthreads,1,MPI_INT,0,world);
-  if (!lmp->kokkos) omp_set_num_threads(nthreads);
-
-  if (me == 0) {
-    if (screen)
-      fprintf(screen,"  using %d OpenMP thread(s) per MPI task\n",nthreads);
-    if (logfile)
-      fprintf(logfile,"  using %d OpenMP thread(s) per MPI task\n",nthreads);
-  }
-#endif
-
-  // initialize comm buffers & exchange memory
-  // NOTE: allow for AtomVec to set maxexchange_atom, e.g. for atom_style body
-
-  maxexchange_atom = maxexchange_fix = 0;
   maxexchange = maxexchange_atom + maxexchange_fix;
   bufextra = maxexchange + BUFEXTRA;
 
@@ -125,26 +120,6 @@ CommBrick::CommBrick(LAMMPS *lmp) : Comm(lmp)
 
 /* ---------------------------------------------------------------------- */
 
-CommBrick::~CommBrick()
-{
-  memory->destroy(grid2proc);
-
-  free_swap();
-  if (mode == MULTI) {
-    free_multi();
-    memory->destroy(cutghostmulti);
-  }
-
-  if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
-  memory->sfree(sendlist);
-  memory->destroy(maxsendlist);
-
-  memory->destroy(buf_send);
-  memory->destroy(buf_recv);
-}
-
-/* ---------------------------------------------------------------------- */
-
 void CommBrick::init()
 {
   triclinic = domain->triclinic;
@@ -280,12 +255,12 @@ void CommBrick::setup()
   //   0 = to left, 1 = to right
   //   set equal to recvneed[idim][1/0] of neighbor proc
   // maxneed[idim] = max procs away any proc recvs atoms in either direction
-  // uniform = 1 = uniform sized sub-domains:
+  // layout = UNIFORM = uniform sized sub-domains:
   //   maxneed is directly computable from sub-domain size
   //     limit to procgrid-1 for non-PBC
   //   recvneed = maxneed except for procs near non-PBC
   //   sendneed = recvneed of neighbor on each side
-  // uniform = 0 = non-uniform sized sub-domains:
+  // layout = NONUNIFORM = non-uniform sized sub-domains:
   //   compute recvneed via updown() which accounts for non-PBC
   //   sendneed = recvneed of neighbor on each side
   //   maxneed via Allreduce() of recvneed
@@ -293,7 +268,7 @@ void CommBrick::setup()
   int *periodicity = domain->periodicity;
   int left,right;
 
-  if (uniform) {
+  if (layout == LAYOUT_UNIFORM) {
     maxneed[0] = static_cast<int> (cutghost[0] * procgrid[0] / prd[0]) + 1;
     maxneed[1] = static_cast<int> (cutghost[1] * procgrid[1] / prd[1]) + 1;
     maxneed[2] = static_cast<int> (cutghost[2] * procgrid[2] / prd[2]) + 1;
@@ -561,16 +536,15 @@ void CommBrick::forward_comm(int dummy)
     } else {
       if (comm_x_only) {
         if (sendnum[iswap])
-          n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                              x[firstrecv[iswap]],pbc_flag[iswap],
-                              pbc[iswap]);
+          avec->pack_comm(sendnum[iswap],sendlist[iswap],
+                          x[firstrecv[iswap]],pbc_flag[iswap],pbc[iswap]);
       } else if (ghost_velocity) {
-        n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
-                                buf_send,pbc_flag[iswap],pbc[iswap]);
+        avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
+                            buf_send,pbc_flag[iswap],pbc[iswap]);
         avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send);
       } else {
-        n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                            buf_send,pbc_flag[iswap],pbc[iswap]);
+        avec->pack_comm(sendnum[iswap],sendlist[iswap],
+                        buf_send,pbc_flag[iswap],pbc[iswap]);
         avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
       }
     }
@@ -620,10 +594,10 @@ void CommBrick::reverse_comm()
     } else {
       if (comm_f_only) {
         if (sendnum[iswap])
-            avec->unpack_reverse(sendnum[iswap],sendlist[iswap],
-                                f[firstrecv[iswap]]);
+          avec->unpack_reverse(sendnum[iswap],sendlist[iswap],
+                               f[firstrecv[iswap]]);
       } else {
-        n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
+        avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
         avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_send);
       }
     }
diff --git a/src/comm_brick.h b/src/comm_brick.h
index 20592accab..8dafecfbe7 100644
--- a/src/comm_brick.h
+++ b/src/comm_brick.h
@@ -21,6 +21,7 @@ namespace LAMMPS_NS {
 class CommBrick : public Comm {
  public:
   CommBrick(class LAMMPS *);
+  CommBrick(class LAMMPS *, class Comm *);
   virtual ~CommBrick();
 
   virtual void init();
@@ -80,6 +81,7 @@ class CommBrick : public Comm {
   int maxexchange;                  // max # of datums/atom in exchange comm 
   int bufextra;                     // extra space beyond maxsend in send buffer
 
+  void init_buffers();
   int updown(int, int, int, double, int, double *);
                                             // compare cutoff to procs
   virtual void grow_send(int, int);         // reallocate send buffer
diff --git a/src/comm_tiled.cpp b/src/comm_tiled.cpp
index 9f111b999a..42f6efdb6c 100644
--- a/src/comm_tiled.cpp
+++ b/src/comm_tiled.cpp
@@ -11,15 +11,20 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+#include "string.h"
 #include "lmptype.h"
 #include "comm_tiled.h"
+#include "comm_brick.h"
 #include "atom.h"
 #include "atom_vec.h"
 #include "domain.h"
+#include "force.h"
 #include "pair.h"
+#include "neighbor.h"
 #include "modify.h"
 #include "fix.h"
 #include "compute.h"
+#include "output.h"
 #include "dump.h"
 #include "memory.h"
 #include "error.h"
@@ -27,32 +32,88 @@
 using namespace LAMMPS_NS;
 
 #define BUFFACTOR 1.5
+#define BUFFACTOR 1.5
+#define BUFMIN 1000
+#define BUFEXTRA 1000
+
+// NOTE: change this to 16 after debugged
+
+#define DELTA_PROCS 1
 
 enum{SINGLE,MULTI};               // same as in Comm
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 /* ---------------------------------------------------------------------- */
 
 CommTiled::CommTiled(LAMMPS *lmp) : Comm(lmp)
 {
-  style = 1;
-  layout = 0;
-
   error->all(FLERR,"Comm_style tiled is not yet supported");
+
+  style = 1;
+  layout = LAYOUT_UNIFORM;
+  init_buffers();
+}
+
+/* ---------------------------------------------------------------------- */
+
+CommTiled::CommTiled(LAMMPS *lmp, Comm *oldcomm) : Comm(*oldcomm)
+{
+  error->all(FLERR,"Comm_style tiled is not yet supported");
+
+  style = 1;
+  layout = oldcomm->layout;
+  copy_arrays(oldcomm);
+  init_buffers();
 }
 
 /* ---------------------------------------------------------------------- */
 
 CommTiled::~CommTiled()
 {
+  memory->destroy(buf_send);
+  memory->destroy(buf_recv);
+  memory->destroy(overlap);
+  deallocate_swap(nswap);
 }
 
-/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+   initialize comm buffers and other data structs local to CommTiled
+   NOTE: if this is identical to CommBrick, put it into Comm ??
+------------------------------------------------------------------------- */
+
+void CommTiled::init_buffers()
+{
+  maxexchange = maxexchange_atom + maxexchange_fix;
+  bufextra = maxexchange + BUFEXTRA;
+
+  maxsend = BUFMIN;
+  memory->create(buf_send,maxsend+bufextra,"comm:buf_send");
+  maxrecv = BUFMIN;
+  memory->create(buf_recv,maxrecv,"comm:buf_recv");
+
+  maxoverlap = 0;
+  overlap = NULL;
+
+  nswap = 2 * domain->dimension;
+  allocate_swap(nswap);
+}
+
+/* ----------------------------------------------------------------------
+   NOTE: if this is nearly identical to CommBrick, put it into Comm ??
+------------------------------------------------------------------------- */
 
 void CommTiled::init()
 {
   triclinic = domain->triclinic;
   map_style = atom->map_style;
 
+  // temporary restrictions
+
+  if (triclinic) 
+    error->all(FLERR,"Cannot yet use comm_style tiled with triclinic box");
+  if (mode == MULTI)
+    error->all(FLERR,"Cannot yet use comm_style tiled with multi-mode comm");
+
   // comm_only = 1 if only x,f are exchanged in forward/reverse comm
   // comm_x_only = 0 if ghost_velocity since velocities are added
 
@@ -72,51 +133,250 @@ void CommTiled::init()
 
   for (int i = 0; i < modify->nfix; i++)
     size_border += modify->fix[i]->comm_border;
+
+  // maxexchange = max # of datums/atom in exchange communication
+  // maxforward = # of datums in largest forward communication
+  // maxreverse = # of datums in largest reverse communication
+  // query pair,fix,compute,dump for their requirements
+  // pair style can force reverse comm even if newton off
+
+  maxexchange = BUFMIN + maxexchange_fix;
+  maxforward = MAX(size_forward,size_border);
+  maxreverse = size_reverse;
+
+  if (force->pair) maxforward = MAX(maxforward,force->pair->comm_forward);
+  if (force->pair) maxreverse = MAX(maxreverse,force->pair->comm_reverse);
+
+  for (int i = 0; i < modify->nfix; i++) {
+    maxforward = MAX(maxforward,modify->fix[i]->comm_forward);
+    maxreverse = MAX(maxreverse,modify->fix[i]->comm_reverse);
+  }
+
+  for (int i = 0; i < modify->ncompute; i++) {
+    maxforward = MAX(maxforward,modify->compute[i]->comm_forward);
+    maxreverse = MAX(maxreverse,modify->compute[i]->comm_reverse);
+  }
+
+  for (int i = 0; i < output->ndump; i++) {
+    maxforward = MAX(maxforward,output->dump[i]->comm_forward);
+    maxreverse = MAX(maxreverse,output->dump[i]->comm_reverse);
+  }
+
+  if (force->newton == 0) maxreverse = 0;
+  if (force->pair) maxreverse = MAX(maxreverse,force->pair->comm_reverse_off);
 }
 
 /* ----------------------------------------------------------------------
    setup spatial-decomposition communication patterns
-   function of neighbor cutoff(s) & cutghostuser & current box size
-   single mode sets slab boundaries (slablo,slabhi) based on max cutoff
-   multi mode sets type-dependent slab boundaries (multilo,multihi)
+   function of neighbor cutoff(s) & cutghostuser & current box size and tiling
 ------------------------------------------------------------------------- */
 
 void CommTiled::setup()
 {
-  // error on triclinic or multi?
-  // set nswap = 2*dim
-  // setup neighbor proc info for exchange()
-  // setup nsendproc and nrecvproc bounts
-  // setup sendproc and recvproc lists
-  // setup sendboxes
-  // reallocate requests and statuses
+  int i;
 
-  // check that cutoff is <= 1/2 of periodic box len?
+  // domain properties used in setup and methods it calls
 
-  // loop over dims
-  //  left:
-  //    construct ghost boxes
-  //      differnet in x,y,z
-  //      account for ghost borders in y,z
-  //      account for PBC by shifting
-  //      split into multiple boxes if straddles PBC
-  //    drop boxes down RCB tree
-  //      count unique procs they cover
-  //      what about self if crosses PBC
-  //      for each proc they cover:
-  //        compute box I send it to left
-  //        is a message I will recv from right (don't care about box)
-  //    for ghost-extended boxes
-  //      do not count procs that do not overlap my owned box at all
-  //      only touching edge of my owned box does not count
-  //      in this case list I send to and recv from may be different?
-  //  same thing to right
+  prd = domain->prd;
+  boxlo = domain->boxlo;
+  boxhi = domain->boxhi;
+  sublo = domain->sublo;
+  subhi = domain->subhi;
 
-  // what need from decomp (RCB):
-  // dropbox: return list of procs with overlap and overlapping boxes
-  //   return n, proclist, boxlist
-  // otherbox: bbox of another proc
-  // dropatom: return what proc owns the atom coord
+  int dimension = domain->dimension;
+  int *periodicity = domain->periodicity;
+
+  // set function pointers
+
+  if (layout != LAYOUT_TILED) {
+    box_drop = &CommTiled::box_drop_brick;
+    box_other = &CommTiled::box_other_brick;
+  } else {
+    box_drop = &CommTiled::box_drop_tiled;
+    box_other = &CommTiled::box_other_tiled;
+  }
+
+  // if RCB decomp has just changed, gather needed global RCB info
+
+  if (rcbnew) {
+    rcbnew = 0;
+    memcpy(&rcbinfo[me].mysplit[0][0],&mysplit[0][0],6*sizeof(double));
+    rcbinfo[me].cut = rcbcut;
+    rcbinfo[me].dim = rcbcutdim;
+    MPI_Allgather(&rcbinfo[me],sizeof(RCBinfo),MPI_CHAR,
+                  rcbinfo,sizeof(RCBinfo),MPI_CHAR,world);
+  }
+
+  // check that cutoff < any periodic box length
+
+  double cut = MAX(neighbor->cutneighmax,cutghostuser);
+  cutghost[0] = cutghost[1] = cutghost[2] = cut;
+  
+  if ((periodicity[0] && cut > prd[0]) ||
+      (periodicity[1] && cut > prd[1]) ||
+      (dimension == 3 && periodicity[2] && cut > prd[2]))
+    error->all(FLERR,"Communication cutoff for comm_style tiled "
+               "cannot exceed periodic box length");
+
+  // loop over 6 swap directions
+  // determine which procs I will send to and receive from in each swap
+  // done by intersecting ghost box with all proc sub-boxes it overlaps
+  // sets nsendproc, nrecvproc, sendproc, recvproc
+  // sets sendother, sendself, pbc_flag, pbc, sendbox
+
+  int noverlap1,indexme;
+  double lo1[3],hi1[3],lo2[3],hi2[3];
+  int one,two;
+
+  nswap = 0;
+  for (int idim = 0; idim < dimension; idim++) {
+    for (int iswap = 0; iswap < 2; iswap++) {
+
+      // one = first ghost box in same periodic image
+      // two = second ghost box wrapped across periodic boundary
+      // either may not exist
+
+      one = 1;
+      lo1[0] = sublo[0]; lo1[1] = sublo[1]; lo1[2] = sublo[2];
+      hi1[0] = subhi[0]; hi1[1] = subhi[1]; hi1[2] = subhi[2];
+      if (iswap == 0) {
+        lo1[idim] = sublo[idim] - cut;
+        hi1[idim] = sublo[idim];
+      } else {
+        lo1[idim] = subhi[idim];
+        hi1[idim] = subhi[idim] + cut;
+      }
+      
+      two = 0;
+      if (iswap == 0 && periodicity[idim] && lo1[idim] < boxlo[idim]) two = 1;
+      if (iswap == 1 && periodicity[idim] && hi1[idim] > boxhi[idim]) two = 1;
+
+      if (two) {
+        lo2[0] = sublo[0]; lo2[1] = sublo[1]; lo2[2] = sublo[2];
+        hi2[0] = subhi[0]; hi2[1] = subhi[1]; hi2[2] = subhi[2];
+        if (iswap == 0) {
+          lo2[idim] = lo1[idim] + prd[idim];
+          hi2[idim] = hi1[idim] + prd[idim];
+          if (sublo[idim] == boxlo[idim]) {
+            one = 0;
+            hi2[idim] = boxhi[idim];
+          }
+        } else {
+          lo2[idim] = lo1[idim] - prd[idim];
+          hi2[idim] = hi1[idim] - prd[idim];
+          if (subhi[idim] == boxhi[idim]) {
+            one = 0;
+            lo2[idim] = boxlo[idim];
+          }
+        }
+      }
+
+      // noverlap = # of overlaps of box1/2 with procs via box_drop()
+      // overlap = list of overlapping procs
+      // if overlap with self, indexme = index of me in list
+
+      indexme = -1;
+      noverlap = 0;
+      if (one) (this->*box_drop)(idim,lo1,hi1,indexme);
+      noverlap1 = noverlap;
+      if (two) (this->*box_drop)(idim,lo2,hi2,indexme);
+
+      // if self is in overlap list, move it to end of list
+
+      if (indexme >= 0) {
+        int tmp = overlap[noverlap-1];
+        overlap[noverlap-1] = overlap[indexme];
+        overlap[indexme] = tmp;
+      }
+
+      // reallocate 2nd dimensions of all send/recv arrays, based on noverlap
+      // # of sends of this swap = # of recvs of nswap +/- 1
+
+      if (noverlap > nprocmax[iswap]) {
+        int oldmax = nprocmax[iswap];
+        while (nprocmax[iswap] < noverlap) nprocmax[iswap] += DELTA_PROCS;
+        grow_swap_send(iswap,nprocmax[iswap],oldmax);
+        if (iswap == 0) grow_swap_recv(iswap+1,nprocmax[iswap]);
+        else grow_swap_recv(iswap-1,nprocmax[iswap]);
+      }
+
+      // overlap how has list of noverlap procs
+      // includes PBC effects
+
+      if (overlap[noverlap-1] == me) sendself[nswap] = 1;
+      else sendself[nswap] = 0;
+      if (noverlap-sendself[nswap]) sendother[nswap] = 1;
+      else sendother[nswap] = 0;
+
+      nsendproc[nswap] = noverlap;
+      for (i = 0; i < noverlap; i++) sendproc[nswap][i] = overlap[i];
+      if (iswap == 0) {
+        nrecvproc[nswap+1] = noverlap;
+        for (i = 0; i < noverlap; i++) recvproc[nswap+1][i] = overlap[i];
+      } else {
+        nrecvproc[nswap-1] = noverlap;
+        for (i = 0; i < noverlap; i++) recvproc[nswap-1][i] = overlap[i];
+      }
+
+      // compute sendbox for each of my sends
+      // obox = intersection of ghostbox with other proc's sub-domain
+      // sbox = what I need to send to other proc
+      //      = sublo to MIN(sublo+cut,subhi) in idim, for iswap = 0
+      //      = MIN(subhi-cut,sublo) to subhi in idim, for iswap = 1
+      //      = obox in other 2 dims
+      // if sbox touches sub-box boundaries in lower dims,
+      //   extend sbox in those lower dims to include ghost atoms
+      
+      double oboxlo[3],oboxhi[3],sbox[6];
+
+      for (i = 0; i < noverlap; i++) {
+        pbc_flag[nswap][i] = 0;
+        pbc[nswap][i][0] = pbc[nswap][i][1] = pbc[nswap][i][2] =
+          pbc[nswap][i][3] = pbc[nswap][i][4] = pbc[nswap][i][5] = 0;
+        
+        (this->*box_other)(idim,iswap,overlap[i],oboxlo,oboxhi);
+        
+        if (i < noverlap1) {
+          sbox[0] = MAX(oboxlo[0],lo1[0]);
+          sbox[1] = MAX(oboxlo[1],lo1[1]);
+          sbox[2] = MAX(oboxlo[2],lo1[2]);
+          sbox[3] = MIN(oboxhi[0],hi1[0]);
+          sbox[4] = MIN(oboxhi[1],hi1[1]);
+          sbox[5] = MIN(oboxhi[2],hi1[2]);
+        } else {
+          pbc_flag[nswap][i] = 1;
+          pbc[nswap][i][idim] = 1;
+          sbox[0] = MAX(oboxlo[0],lo2[0]);
+          sbox[1] = MAX(oboxlo[1],lo2[1]);
+          sbox[2] = MAX(oboxlo[2],lo2[2]);
+          sbox[3] = MIN(oboxhi[0],hi2[0]);
+          sbox[4] = MIN(oboxhi[1],hi2[1]);
+          sbox[5] = MIN(oboxhi[2],hi2[2]);
+        }
+
+        if (iswap == 0) {
+          sbox[idim] = sublo[idim];
+          sbox[3+idim] = MIN(sublo[idim]+cut,subhi[idim]);
+        } else {
+          sbox[idim] = MAX(subhi[idim]-cut,sublo[idim]);
+          sbox[3+idim] = subhi[idim];
+        }
+
+        if (idim >= 1) {
+          if (sbox[0] == sublo[0]) sbox[0] -= cut;
+          if (sbox[4] == subhi[0]) sbox[4] += cut;
+        }
+        if (idim == 2) {
+          if (sbox[1] == sublo[1]) sbox[1] -= cut;
+          if (sbox[5] == subhi[1]) sbox[5] += cut;
+        }
+        
+        memcpy(sendbox[nswap][i],sbox,6*sizeof(double));
+      }
+
+      nswap++;
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -126,46 +386,73 @@ void CommTiled::setup()
 
 void CommTiled::forward_comm(int dummy)
 {
-  int i,irecv,n;
+  int i,irecv,n,nsend,nrecv;
   MPI_Status status;
   AtomVec *avec = atom->avec;
   double **x = atom->x;
 
   // exchange data with another set of procs in each swap
-  // if first proc in set is self, then is just self across PBC, just copy
+  // post recvs from all procs except self
+  // send data to all procs except self
+  // copy data to self if sendself is set
+  // wait on all procs except self and unpack received data
   // if comm_x_only set, exchange or copy directly to x, don't unpack
 
   for (int iswap = 0; iswap < nswap; iswap++) {
-    if (sendproc[iswap][0] != me) {
-      if (comm_x_only) {
-        for (i = 0; i < nrecvproc[iswap]; i++)
+    nsend = nsendproc[iswap] - sendself[iswap];
+    nrecv = nrecvproc[iswap] - sendself[iswap];
+
+    if (comm_x_only) {
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++)
           MPI_Irecv(x[firstrecv[iswap][i]],size_forward_recv[iswap][i],
                     MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
-        for (i = 0; i < nsendproc[iswap]; i++) {
+        for (i = 0; i < nsend; i++) {
           n = avec->pack_comm(sendnum[iswap][i],sendlist[iswap][i],
                               buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
           MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap][i],0,world);
         }
-        MPI_Waitall(nrecvproc[iswap],requests,statuses);
+      }
 
-      } else if (ghost_velocity) {
-        for (i = 0; i < nrecvproc[iswap]; i++)
+      if (sendself[iswap]) {
+        avec->pack_comm(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                        x[firstrecv[iswap][nrecv]],pbc_flag[iswap][nsend],
+                        pbc[iswap][nsend]);
+      }
+
+      if (sendother[iswap]) MPI_Waitall(nrecv,requests,statuses);
+
+    } else if (ghost_velocity) {
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++)
           MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
                     size_forward_recv[iswap][i],
                     MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
-        for (i = 0; i < nsendproc[iswap]; i++) {
+        for (i = 0; i < nsend; i++) {
           n = avec->pack_comm_vel(sendnum[iswap][i],sendlist[iswap][i],
                                   buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
           MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap][i],0,world);
         }
-        for (i = 0; i < nrecvproc[iswap]; i++) {
-          MPI_Waitany(nrecvproc[iswap],requests,&irecv,&status);
-          avec->unpack_comm_vel(recvnum[iswap][i],firstrecv[iswap][irecv],
+      }
+
+      if (sendself[iswap]) {
+        avec->pack_comm_vel(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                            buf_send,pbc_flag[iswap][nsend],pbc[iswap][nsend]);
+        avec->unpack_comm_vel(recvnum[iswap][nrecv],firstrecv[iswap][nrecv],
+                              buf_send);
+      }
+
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++) {
+          MPI_Waitany(nrecv,requests,&irecv,&status);
+          avec->unpack_comm_vel(recvnum[iswap][irecv],firstrecv[iswap][irecv],
                                 &buf_recv[forward_recv_offset[iswap][irecv]]);
         }
+      }
 
-      } else {
-        for (i = 0; i < nrecvproc[iswap]; i++)
+    } else {
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++)
           MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
                     size_forward_recv[iswap][i],
                     MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
@@ -174,27 +461,21 @@ void CommTiled::forward_comm(int dummy)
                               buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
           MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap][i],0,world);
         }
-        for (i = 0; i < nrecvproc[iswap]; i++) {
-          MPI_Waitany(nrecvproc[iswap],requests,&irecv,&status);
-          avec->unpack_comm(recvnum[iswap][i],firstrecv[iswap][irecv],
-                            &buf_recv[forward_recv_offset[iswap][irecv]]);
-        }
       }
 
-    } else {
-      if (comm_x_only) {
-        if (sendnum[iswap][0])
-          n = avec->pack_comm(sendnum[iswap][0],sendlist[iswap][0],
-                              x[firstrecv[iswap][0]],pbc_flag[iswap][0],
-                              pbc[iswap][0]);
-      } else if (ghost_velocity) {
-        n = avec->pack_comm_vel(sendnum[iswap][0],sendlist[iswap][0],
-                                buf_send,pbc_flag[iswap][0],pbc[iswap][0]);
-        avec->unpack_comm_vel(recvnum[iswap][0],firstrecv[iswap][0],buf_send);
-      } else {
-        n = avec->pack_comm(sendnum[iswap][0],sendlist[iswap][0],
-                            buf_send,pbc_flag[iswap][0],pbc[iswap][0]);
-        avec->unpack_comm(recvnum[iswap][0],firstrecv[iswap][0],buf_send);
+      if (sendself[iswap]) {
+        avec->pack_comm(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                        buf_send,pbc_flag[iswap][nsend],pbc[iswap][nsend]);
+        avec->unpack_comm(recvnum[iswap][nrecv],firstrecv[iswap][nrecv],
+                          buf_send);
+      }
+
+      if (sendother[iswap]) {
+        for (i = 0; i < nrecv; i++) {
+          MPI_Waitany(nrecv,requests,&irecv,&status);
+          avec->unpack_comm(recvnum[iswap][irecv],firstrecv[iswap][irecv],
+                            &buf_recv[forward_recv_offset[iswap][irecv]]);
+        }
       }
     }
   }
@@ -207,57 +488,73 @@ void CommTiled::forward_comm(int dummy)
 
 void CommTiled::reverse_comm()
 {
-  int i,irecv,n;
+  int i,irecv,n,nsend,nrecv;
   MPI_Request request;
   MPI_Status status;
   AtomVec *avec = atom->avec;
   double **f = atom->f;
 
   // exchange data with another set of procs in each swap
-  // if first proc in set is self, then is just self across PBC, just copy
+  // post recvs from all procs except self
+  // send data to all procs except self
+  // copy data to self if sendself is set
+  // wait on all procs except self and unpack received data
   // if comm_f_only set, exchange or copy directly from f, don't pack
 
   for (int iswap = nswap-1; iswap >= 0; iswap--) {
-    if (sendproc[iswap][0] != me) {
-      if (comm_f_only) {
-        for (i = 0; i < nsendproc[iswap]; i++)
+    nsend = nsendproc[iswap] - sendself[iswap];
+    nrecv = nrecvproc[iswap] - sendself[iswap];
+
+    if (comm_f_only) {
+      if (sendother[iswap]) {
+        for (i = 0; i < nsend; i++)
           MPI_Irecv(&buf_recv[reverse_recv_offset[iswap][i]],
                     size_reverse_recv[iswap][i],MPI_DOUBLE,
                     sendproc[iswap][i],0,world,&requests[i]);
-        for (i = 0; i < nrecvproc[iswap]; i++)
+        for (i = 0; i < nrecv; i++)
           MPI_Send(f[firstrecv[iswap][i]],size_reverse_send[iswap][i],
                    MPI_DOUBLE,recvproc[iswap][i],0,world);
-        for (i = 0; i < nsendproc[iswap]; i++) {
-          MPI_Waitany(nsendproc[iswap],requests,&irecv,&status);
-          avec->unpack_reverse(sendnum[iswap][irecv],sendlist[iswap][irecv],
-                               &buf_recv[reverse_recv_offset[iswap][irecv]]);
-        }
+      }
 
-      } else {
-        for (i = 0; i < nsendproc[iswap]; i++)
-          MPI_Irecv(&buf_recv[reverse_recv_offset[iswap][i]],
-                    size_reverse_recv[iswap][i],MPI_DOUBLE,
-                    sendproc[iswap][i],0,world,&requests[i]);
-        for (i = 0; i < nrecvproc[iswap]; i++) {
-          n = avec->pack_reverse(recvnum[iswap][i],firstrecv[iswap][i],
-                                 buf_send);
-          MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap][i],0,world);
-        }
-        for (i = 0; i < nsendproc[iswap]; i++) {
-          MPI_Waitany(nsendproc[iswap],requests,&irecv,&status);
+      if (sendself[iswap]) {
+        avec->unpack_reverse(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                             f[firstrecv[iswap][nrecv]]);
+      }
+
+      if (sendother[iswap]) {
+        for (i = 0; i < nsend; i++) {
+          MPI_Waitany(nsend,requests,&irecv,&status);
           avec->unpack_reverse(sendnum[iswap][irecv],sendlist[iswap][irecv],
                                &buf_recv[reverse_recv_offset[iswap][irecv]]);
         }
       }
-
+      
     } else {
-      if (comm_f_only) {
-        if (sendnum[iswap][0])
-          avec->unpack_reverse(sendnum[iswap][0],sendlist[iswap][0],
-                               f[firstrecv[iswap][0]]);
-      } else {
-        n = avec->pack_reverse(recvnum[iswap][0],firstrecv[iswap][0],buf_send);
-        avec->unpack_reverse(sendnum[iswap][0],sendlist[iswap][0],buf_send);
+      if (sendother[iswap]) {
+        for (i = 0; i < nsend; i++)
+          MPI_Irecv(&buf_recv[reverse_recv_offset[iswap][i]],
+                    size_reverse_recv[iswap][i],MPI_DOUBLE,
+                    sendproc[iswap][i],0,world,&requests[i]);
+        for (i = 0; i < nrecv; i++) {
+          n = avec->pack_reverse(recvnum[iswap][i],firstrecv[iswap][i],
+                                 buf_send);
+          MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap][i],0,world);
+        }
+      }
+
+      if (sendself[iswap]) {
+        avec->pack_reverse(recvnum[iswap][nrecv],firstrecv[iswap][nrecv],
+                           buf_send);
+        avec->unpack_reverse(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                             buf_send);
+      }
+
+      if (sendother[iswap]) {
+        for (i = 0; i < nsend; i++) {
+          MPI_Waitany(nsend,requests,&irecv,&status);
+          avec->unpack_reverse(sendnum[iswap][irecv],sendlist[iswap][irecv],
+                               &buf_recv[reverse_recv_offset[iswap][irecv]]);
+        }
       }
     }
   }
@@ -298,7 +595,7 @@ void CommTiled::exchange()
 
 void CommTiled::borders()
 {
-  int i,n,irecv,ngroup,nlast,nsend,rmaxswap;
+  int i,m,n,irecv,nlocal,nlast,nsend,nrecv,ncount,rmaxswap;
   double xlo,xhi,ylo,yhi,zlo,zhi;
   double *bbox;
   double **x;
@@ -320,65 +617,69 @@ void CommTiled::borders()
     // for yz-dim swaps, check owned and ghost atoms
     // store sent atom indices in list for use in future timesteps
     // NOTE: assume SINGLE mode, add back in logic for MULTI mode later
+    //       and for ngroup when bordergroup is set
 
     x = atom->x;
 
-    for (i = 0; i < nsendproc[iswap]; i++) {
-      bbox = sendbox[iswap][i];
-      xlo = bbox[0]; xhi = bbox[1];
-      ylo = bbox[2]; yhi = bbox[3];
-      zlo = bbox[4]; zhi = bbox[5];
+    for (m = 0; m < nsendproc[iswap]; m++) {
+      bbox = sendbox[iswap][m];
+      xlo = bbox[0]; ylo = bbox[1]; zlo = bbox[2];
+      xhi = bbox[3]; yhi = bbox[4]; zhi = bbox[5];
 
-      ngroup = atom->nfirst;
+      nlocal = atom->nlocal;
       if (iswap < 2) nlast = atom->nlocal;
       else nlast = atom->nlocal + atom->nghost;
 
-      nsend = 0;
-      for (i = 0; i < ngroup; i++)
+      ncount = 0;
+      for (i = 0; i < nlocal; i++)
         if (x[i][0] >= xlo && x[i][0] <= xhi &&
             x[i][1] >= ylo && x[i][1] <= yhi &&
             x[i][2] >= zlo && x[i][2] <= zhi) {
-          if (nsend == maxsendlist[iswap][i]) grow_list(iswap,i,nsend);
-          sendlist[iswap][i][nsend++] = i;
+          if (ncount == maxsendlist[iswap][m]) grow_list(iswap,i,ncount);
+          sendlist[iswap][m][ncount++] = i;
         }
       for (i = atom->nlocal; i < nlast; i++)
         if (x[i][0] >= xlo && x[i][0] <= xhi &&
             x[i][1] >= ylo && x[i][1] <= yhi &&
             x[i][2] >= zlo && x[i][2] <= zhi) {
-          if (nsend == maxsendlist[iswap][i]) grow_list(iswap,i,nsend);
-          sendlist[iswap][i][nsend++] = i;
+          if (ncount == maxsendlist[iswap][m]) grow_list(iswap,i,ncount);
+          sendlist[iswap][m][ncount++] = i;
         }
-      sendnum[iswap][i] = nsend;
-      smax = MAX(smax,nsend);
+      sendnum[iswap][m] = ncount;
+      smax = MAX(smax,ncount);
     }
 
-    // send sendnum counts to procs who recv from me
+    // send sendnum counts to procs who recv from me except self
+    // copy data to self if sendself is set
 
-    if (sendproc[iswap][0] != me) {
-      for (i = 0; i < nrecvproc[iswap]; i++)
-        MPI_Irecv(&recvnum[iswap][i],1,MPI_INT,
-                  recvproc[iswap][i],0,world,&requests[i]);
-      for (i = 0; i < nsendproc[iswap]; i++)
-        MPI_Send(&sendnum[iswap][i],1,MPI_INT,sendproc[iswap][i],0,world);
-      MPI_Waitall(nrecvproc[iswap],requests,statuses);
+    nsend = nsendproc[iswap] - sendself[iswap];
+    nrecv = nrecvproc[iswap] - sendself[iswap];
 
-    } else recvnum[iswap][0] = sendnum[iswap][0];
+    if (sendother[iswap]) {
+      for (m = 0; m < nrecv; m++)
+        MPI_Irecv(&recvnum[iswap][m],1,MPI_INT,
+                  recvproc[iswap][m],0,world,&requests[m]);
+      for (m = 0; m < nsend; m++)
+        MPI_Send(&sendnum[iswap][m],1,MPI_INT,sendproc[iswap][m],0,world);
+    }
+    if (sendself[iswap]) recvnum[iswap][nrecv] = sendnum[iswap][nsend];
+    if (sendother[iswap]) MPI_Waitall(nrecv,requests,statuses);
 
     // setup other per swap/proc values from sendnum and recvnum
 
     rmaxswap = 0;
-    for (i = 0; i < nrecvproc[iswap]; i++) {
-      rmaxswap += recvnum[iswap][i];
-      size_forward_recv[iswap][i] = recvnum[iswap][i]*size_forward;
-      size_reverse_send[iswap][i] = recvnum[iswap][i]*size_reverse;
-      size_reverse_recv[iswap][i] = sendnum[iswap][i]*size_reverse;
-      if (i == 0) {
+    for (m = 0; m < nrecvproc[iswap]; m++) {
+      rmaxswap += recvnum[iswap][m];
+      size_forward_recv[iswap][m] = recvnum[iswap][m]*size_forward;
+      size_reverse_send[iswap][m] = recvnum[iswap][m]*size_reverse;
+      size_reverse_recv[iswap][m] = sendnum[iswap][m]*size_reverse;
+      if (m == 0) {
         firstrecv[iswap][0] = atom->nlocal + atom->nghost;
         forward_recv_offset[iswap][0] = 0;
       } else {
-        firstrecv[iswap][i] = firstrecv[iswap][i-1] + recvnum[iswap][i-1];
-        forward_recv_offset[iswap][i] = 
-          forward_recv_offset[iswap][i-1] + recvnum[iswap][i-1];
+        firstrecv[iswap][m] = firstrecv[iswap][m-1] + recvnum[iswap][m-1];
+        forward_recv_offset[iswap][m] = 
+          forward_recv_offset[iswap][m-1] + recvnum[iswap][m-1];
       }
     }
     rmax = MAX(rmax,rmaxswap);
@@ -390,54 +691,64 @@ void CommTiled::borders()
 
     // swap atoms with other procs using pack_border(), unpack_border()
 
-    if (sendproc[iswap][0] != me) {
-      for (i = 0; i < nsendproc[iswap]; i++) {
-        if (ghost_velocity) {
-          for (i = 0; i < nrecvproc[iswap]; i++)
-            MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
-                      recvnum[iswap][i]*size_border,
-                      MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
-          for (i = 0; i < nsendproc[iswap]; i++) {
-            n = avec->pack_border_vel(sendnum[iswap][i],sendlist[iswap][i],
-                                      buf_send,pbc_flag[iswap][i],
-                                      pbc[iswap][i]);
-            MPI_Send(buf_send,n*size_border,MPI_DOUBLE,
-                     sendproc[iswap][i],0,world);
-          }
-          for (i = 0; i < nrecvproc[iswap]; i++) {
-            MPI_Waitany(nrecvproc[iswap],requests,&irecv,&status);
-            avec->unpack_border(recvnum[iswap][i],firstrecv[iswap][irecv],
-                                &buf_recv[forward_recv_offset[iswap][irecv]]);
-          }
-          
-        } else {
-          for (i = 0; i < nrecvproc[iswap]; i++)
-            MPI_Irecv(&buf_recv[forward_recv_offset[iswap][i]],
-                      recvnum[iswap][i]*size_border,
-                      MPI_DOUBLE,recvproc[iswap][i],0,world,&requests[i]);
-          for (i = 0; i < nsendproc[iswap]; i++) {
-            n = avec->pack_border(sendnum[iswap][i],sendlist[iswap][i],
-                                  buf_send,pbc_flag[iswap][i],pbc[iswap][i]);
-            MPI_Send(buf_send,n*size_border,MPI_DOUBLE,
-                     sendproc[iswap][i],0,world);
-          }
-          for (i = 0; i < nrecvproc[iswap]; i++) {
-            MPI_Waitany(nrecvproc[iswap],requests,&irecv,&status);
-            avec->unpack_border(recvnum[iswap][i],firstrecv[iswap][irecv],
-                                &buf_recv[forward_recv_offset[iswap][irecv]]);
-          }
+    if (ghost_velocity) {
+      if (sendother[iswap]) {
+        for (m = 0; m < nrecv; m++)
+          MPI_Irecv(&buf_recv[forward_recv_offset[iswap][m]],
+                    recvnum[iswap][m]*size_border,
+                    MPI_DOUBLE,recvproc[iswap][m],0,world,&requests[m]);
+        for (m = 0; m < nsend; m++) {
+          n = avec->pack_border_vel(sendnum[iswap][m],sendlist[iswap][m],
+                                    buf_send,pbc_flag[iswap][m],pbc[iswap][m]);
+          MPI_Send(buf_send,n*size_border,MPI_DOUBLE,
+                   sendproc[iswap][m],0,world);
+        }
+      }
+
+      if (sendself[iswap]) {
+        n = avec->pack_border_vel(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                                  buf_send,pbc_flag[iswap][nsend],
+                                  pbc[iswap][nsend]);
+        avec->unpack_border_vel(recvnum[iswap][nrecv],firstrecv[iswap][nrecv],
+                                buf_send);
+      }
+
+      if (sendother[iswap]) {
+        for (m = 0; m < nrecv; m++) {
+          MPI_Waitany(nrecv,requests,&irecv,&status);
+          avec->unpack_border(recvnum[iswap][irecv],firstrecv[iswap][irecv],
+                              &buf_recv[forward_recv_offset[iswap][irecv]]);
         }
       }
 
     } else {
-      if (ghost_velocity) {
-        n = avec->pack_border_vel(sendnum[iswap][0],sendlist[iswap][0],
-                                  buf_send,pbc_flag[iswap][0],pbc[iswap][0]);
-        avec->unpack_border_vel(recvnum[iswap][0],firstrecv[iswap][0],buf_send);
-      } else {
-        n = avec->pack_border(sendnum[iswap][0],sendlist[iswap][0],
-                              buf_send,pbc_flag[iswap][0],pbc[iswap][0]);
-        avec->unpack_border(recvnum[iswap][0],firstrecv[iswap][0],buf_send);
+      if (sendother[iswap]) {
+        for (m = 0; m < nrecv; m++)
+          MPI_Irecv(&buf_recv[forward_recv_offset[iswap][m]],
+                    recvnum[iswap][m]*size_border,
+                    MPI_DOUBLE,recvproc[iswap][m],0,world,&requests[m]);
+        for (m = 0; m < nsend; m++) {
+          n = avec->pack_border(sendnum[iswap][m],sendlist[iswap][m],
+                                buf_send,pbc_flag[iswap][m],pbc[iswap][m]);
+          MPI_Send(buf_send,n*size_border,MPI_DOUBLE,
+                   sendproc[iswap][m],0,world);
+        }
+      }
+
+      if (sendself[iswap]) {
+        n = avec->pack_border(sendnum[iswap][nsend],sendlist[iswap][nsend],
+                              buf_send,pbc_flag[iswap][nsend],
+                              pbc[iswap][nsend]);
+        avec->unpack_border(recvnum[iswap][nsend],firstrecv[iswap][nsend],
+                            buf_send);
+      }
+
+      if (sendother[iswap]) {
+        for (m = 0; m < nrecv; m++) {
+          MPI_Waitany(nrecv,requests,&irecv,&status);
+          avec->unpack_border(recvnum[iswap][irecv],firstrecv[iswap][irecv],
+                              &buf_recv[forward_recv_offset[iswap][irecv]]);
+        }
       }
     }
 
@@ -459,6 +770,8 @@ void CommTiled::borders()
   if (map_style) atom->map_set();
 }
 
+// NOTE: remaining forward/reverse methods still need to be updated
+
 /* ----------------------------------------------------------------------
    forward communication invoked by a Pair
    n = constant number of datums per atom
@@ -786,6 +1099,186 @@ int CommTiled::exchange_variable(int n, double *inbuf, double *&outbuf)
   return nrecv;
 }
 
+/* ----------------------------------------------------------------------
+   determine overlap list of Noverlap procs the lo/hi box overlaps
+   overlap = non-zero area in common between box and proc sub-domain
+   box is owned by me and extends in dim
+------------------------------------------------------------------------- */
+
+void CommTiled::box_drop_brick(int idim, double *lo, double *hi, int &indexme)
+{
+  // NOTE: this is not triclinic compatible
+
+  int index,dir;
+  if (hi[idim] == sublo[idim]) {
+    index = myloc[idim] - 1;
+    dir = -1;
+  } else if (lo[idim] == subhi[idim]) {
+    index = myloc[idim] + 1;
+    dir = 1;
+  } else if (hi[idim] == boxhi[idim]) {
+    index = procgrid[idim] - 1;
+    dir = -1;
+  } else if (lo[idim] == boxlo[idim]) {
+    index = 0;
+    dir = 1;
+  }
+
+  int other1,other2,proc;
+  double lower,upper;
+  double *split;
+
+  if (idim == 0) {
+    other1 = myloc[1]; other2 = myloc[2];
+    split = xsplit;
+  } else if (idim == 1) {
+    other1 = myloc[0]; other2 = myloc[2];
+    split = ysplit;
+  } else {
+    other1 = myloc[0]; other2 = myloc[1];
+    split = zsplit;
+  }
+
+  while (1) {
+    lower = boxlo[idim] + prd[idim]*split[index];
+    if (index < procgrid[idim]-1) 
+      upper = boxlo[idim] + prd[idim]*split[index+1];
+    else upper = boxhi[idim];
+    if (lower >= hi[idim] || upper <= lo[idim]) break;
+
+    if (idim == 0) proc = grid2proc[index][other1][other2];
+    else if (idim == 1) proc = grid2proc[other1][index][other2];
+    else proc = grid2proc[other1][other2][idim];
+
+    if (noverlap == maxoverlap) {
+      maxoverlap += DELTA_PROCS;
+      memory->grow(overlap,maxoverlap,"comm:overlap");
+    }
+
+    if (proc == me) indexme = noverlap;
+    overlap[noverlap++] = proc;
+    index += dir;
+    if (index < 0 || index >= procgrid[idim]) break;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   determine overlap list of Noverlap procs the lo/hi box overlaps
+   overlap = non-zero area in common between box and proc sub-domain
+   recursive method for traversing an RCB tree of cuts
+   no need to split lo/hi box as recurse b/c OK if box extends outside RCB box
+------------------------------------------------------------------------- */
+
+void CommTiled::box_drop_tiled(int idim, double *lo, double *hi, int &indexme)
+{
+  box_drop_tiled_recurse(lo,hi,0,nprocs-1,indexme);
+}
+
+void CommTiled::box_drop_tiled_recurse(double *lo, double *hi, 
+                                       int proclower, int procupper,
+                                       int &indexme)
+{
+  // end recursion when partition is a single proc
+  // add proc to overlap list
+
+  if (proclower == procupper) {
+    if (noverlap == maxoverlap) {
+      maxoverlap += DELTA_PROCS;
+      memory->grow(overlap,maxoverlap,"comm:overlap");
+    }
+
+    if (proclower == me) indexme = noverlap;
+    overlap[noverlap++] = proclower;
+    return;
+  }
+
+  // drop box on each side of cut it extends beyond
+  // use > and < criteria so does not include a box it only touches
+  // procmid = 1st processor in upper half of partition
+  //         = location in tree that stores this cut
+  // dim = 0,1,2 dimension of cut
+  // cut = position of cut
+
+  int procmid = proclower + (procupper - proclower) / 2 + 1;
+  double cut = rcbinfo[procmid].cut;
+  int idim = rcbinfo[procmid].dim;
+  
+  if (lo[idim] < cut) 
+    box_drop_tiled_recurse(lo,hi,proclower,procmid-1,indexme);
+  if (hi[idim] > cut)
+    box_drop_tiled_recurse(lo,hi,procmid,procupper,indexme);
+}
+
+/* ----------------------------------------------------------------------
+   return other box owned by proc as lo/hi corner pts
+------------------------------------------------------------------------- */
+
+void CommTiled::box_other_brick(int idim, int iswap,
+                                int proc, double *lo, double *hi)
+{
+  lo[0] = sublo[0]; lo[1] = sublo[1]; lo[2] = sublo[2]; 
+  hi[0] = subhi[0]; hi[1] = subhi[1]; hi[2] = subhi[2]; 
+
+  int other1,other2,oproc;
+  double *split;
+
+  if (idim == 0) {
+    other1 = myloc[1]; other2 = myloc[2];
+    split = xsplit;
+  } else if (idim == 1) {
+    other1 = myloc[0]; other2 = myloc[2];
+    split = ysplit;
+  } else {
+    other1 = myloc[0]; other2 = myloc[1];
+    split = zsplit;
+  }
+
+  int dir = -1;
+  if (iswap) dir = 1;
+  int index = myloc[idim];
+  int n = procgrid[idim];
+
+  for (int i = 0; i < n; i++) {
+    index += dir;
+    if (index < 0) index = n-1;
+    else if (index >= n) index = 0;
+
+    if (idim == 0) oproc = grid2proc[index][other1][other2];
+    else if (idim == 1) oproc = grid2proc[other1][index][other2];
+    else oproc = grid2proc[other1][other2][idim];
+
+    if (proc == oproc) {
+      lo[idim] = boxlo[idim] + prd[idim]*split[index];
+      if (split[index+1] < 1.0) 
+        hi[idim] = boxlo[idim] + prd[idim]*split[index+1];
+      else hi[idim] = boxhi[idim];
+      return;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   return other box owned by proc as lo/hi corner pts
+------------------------------------------------------------------------- */
+
+void CommTiled::box_other_tiled(int idim, int iswap,
+                                int proc, double *lo, double *hi)
+{
+  double (*split)[2] = rcbinfo[proc].mysplit;
+
+  lo[0] = boxlo[0] + prd[0]*split[0][0];
+  if (split[0][1] < 1.0) hi[0] = boxlo[0] + prd[0]*split[0][1];
+  else hi[0] = boxhi[0];
+
+  lo[1] = boxlo[1] + prd[1]*split[1][0];
+  if (split[1][1] < 1.0) hi[1] = boxlo[1] + prd[1]*split[1][1];
+  else hi[1] = boxhi[1];
+
+  lo[2] = boxlo[2] + prd[2]*split[2][0];
+  if (split[2][1] < 1.0) hi[2] = boxlo[2] + prd[2]*split[2][1];
+  else hi[2] = boxhi[2];
+}
+
 /* ----------------------------------------------------------------------
    realloc the size of the send buffer as needed with BUFFACTOR and bufextra
    if flag = 1, realloc
@@ -822,7 +1315,178 @@ void CommTiled::grow_list(int iswap, int iwhich, int n)
 {
   maxsendlist[iswap][iwhich] = static_cast<int> (BUFFACTOR * n);
   memory->grow(sendlist[iswap][iwhich],maxsendlist[iswap][iwhich],
-               "comm:sendlist[iswap]");
+               "comm:sendlist[i][j]");
+}
+
+/* ----------------------------------------------------------------------
+   allocation of swap info
+------------------------------------------------------------------------- */
+
+void CommTiled::allocate_swap(int n)
+{
+  nsendproc = new int[n];
+  nrecvproc = new int[n];
+  sendother = new int[n];
+  sendself = new int[n];
+  nprocmax = new int[n];
+
+  sendproc = new int*[n];
+  recvproc = new int*[n];
+  sendnum = new int*[n];
+  recvnum = new int*[n];
+  size_forward_recv = new int*[n];
+  firstrecv = new int*[n];
+  size_reverse_send = new int*[n];
+  size_reverse_recv = new int*[n];
+  forward_recv_offset = new int*[n];
+  reverse_recv_offset = new int*[n];
+
+  pbc_flag = new int*[n];
+  pbc = new int**[n];
+  sendbox = new double**[n];
+  maxsendlist = new int*[n];
+  sendlist = new int**[n];
+
+  for (int i = 0; i < n; i++) {
+    sendproc[i] = recvproc[i] = NULL;
+    sendnum[i] = recvnum[i] = NULL;
+    size_forward_recv[i] = firstrecv[i] = NULL;
+    size_reverse_send[i] = size_reverse_recv[i] = NULL;
+    forward_recv_offset[i] = reverse_recv_offset[i] = NULL;
+
+    pbc_flag[i] = NULL;
+    pbc[i] = NULL;
+    sendbox[i] = NULL;
+    maxsendlist[i] = NULL;
+    sendlist[i] = NULL;
+  }
+
+  maxreqstat = 0;
+  requests = NULL;
+  statuses = NULL;
+
+  for (int i = 0; i < n; i++) {
+    nprocmax[i] = DELTA_PROCS;
+    grow_swap_send(i,DELTA_PROCS,0);
+    grow_swap_recv(i,DELTA_PROCS);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   grow info for swap I, to allow for N procs to communicate with
+   ditto for complementary recv for swap I+1 or I-1, as invoked by caller
+------------------------------------------------------------------------- */
+
+void CommTiled::grow_swap_send(int i, int n, int nold)
+{
+  delete [] sendproc[i];
+  sendproc[i] = new int[n];
+  delete [] sendnum[i];
+  sendnum[i] = new int[n];
+
+  delete [] size_reverse_recv[i];
+  size_reverse_recv[i] = new int[n];
+  delete [] reverse_recv_offset[i];
+  reverse_recv_offset[i] = new int[n];
+
+  delete [] pbc_flag[i];
+  pbc_flag[i] = new int[n];
+  memory->destroy(pbc[i]);
+  memory->create(pbc[i],n,6,"comm:pbc_flag");
+  memory->destroy(sendbox[i]);
+  memory->create(sendbox[i],n,6,"comm:sendbox");
+
+  delete [] maxsendlist[i];
+  maxsendlist[i] = new int[n];
+
+  for (int j = 0; j < nold; j++) memory->destroy(sendlist[i][j]);
+  delete [] sendlist[i];
+  sendlist[i] = new int*[n];
+  for (int j = 0; j < n; j++) {
+    maxsendlist[i][j] = BUFMIN;
+    memory->create(sendlist[i][j],BUFMIN,"comm:sendlist[i][j]");
+  }
+}
+
+void CommTiled::grow_swap_recv(int i, int n)
+{
+  delete [] recvproc[i];
+  recvproc[i] = new int[n];
+  delete [] recvnum[i];
+  recvnum[i] = new int[n];
+
+  delete [] size_forward_recv[i];
+  size_forward_recv[i] = new int[n];
+  delete [] firstrecv[i];
+  firstrecv[i] = new int[n];
+  delete [] forward_recv_offset[i];
+  forward_recv_offset[i] = new int[n];
+
+  delete [] size_reverse_send[i];
+  size_reverse_send[i] = new int[n];
+
+  if (n > maxreqstat) {
+    maxreqstat = n;
+    delete [] requests;
+    delete [] statuses;
+    requests = new MPI_Request[n];
+    statuses = new MPI_Status[n];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   deallocate swap info
+------------------------------------------------------------------------- */
+
+void CommTiled::deallocate_swap(int n)
+{
+  delete [] nsendproc;
+  delete [] nrecvproc;
+  delete [] sendother;
+  delete [] sendself;
+
+  for (int i = 0; i < n; i++) {
+    delete [] sendproc[i];
+    delete [] recvproc[i];
+    delete [] sendnum[i];
+    delete [] recvnum[i];
+    delete [] size_forward_recv[i];
+    delete [] firstrecv[i];
+    delete [] size_reverse_send[i];
+    delete [] size_reverse_recv[i];
+    delete [] forward_recv_offset[i];
+    delete [] reverse_recv_offset[i];
+
+    delete [] pbc_flag[i];
+    memory->destroy(pbc[i]);
+    memory->destroy(sendbox[i]);
+    delete [] maxsendlist[i];
+
+    for (int j = 0; j < nprocmax[i]; j++) memory->destroy(sendlist[i][j]);
+    delete [] sendlist[i];
+  }
+
+  delete [] sendproc;
+  delete [] recvproc;
+  delete [] sendnum;
+  delete [] recvnum;
+  delete [] size_forward_recv;
+  delete [] firstrecv;
+  delete [] size_reverse_send;
+  delete [] size_reverse_recv;
+  delete [] forward_recv_offset;
+  delete [] reverse_recv_offset;
+
+  delete [] pbc_flag;
+  delete [] pbc;
+  delete [] sendbox;
+  delete [] maxsendlist;
+  delete [] sendlist;
+
+  delete [] requests;
+  delete [] statuses;
+
+  delete [] nprocmax;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/comm_tiled.h b/src/comm_tiled.h
index 2c7a6e62df..fd27c4de41 100644
--- a/src/comm_tiled.h
+++ b/src/comm_tiled.h
@@ -21,6 +21,7 @@ namespace LAMMPS_NS {
 class CommTiled : public Comm {
  public:
   CommTiled(class LAMMPS *);
+  CommTiled(class LAMMPS *, class Comm *);
   virtual ~CommTiled();
 
   void init();
@@ -46,22 +47,23 @@ class CommTiled : public Comm {
   bigint memory_usage();
 
  private:
-  int nswap;                    // # of swaps to perform = 2*dim
-  int *nsendproc,*nrecvproc;    // # of procs to send/recv to/from in each swap
-
   int triclinic;                    // 0 if domain is orthog, 1 if triclinic
   int map_style;                    // non-0 if global->local mapping is done
   int size_forward;                 // # of per-atom datums in forward comm
   int size_reverse;                 // # of datums in reverse comm
   int size_border;                  // # of datums in forward border comm
 
-  int **sendnum,**recvnum;      // # of atoms to send/recv per swap/proc
+  int nswap;                    // # of swaps to perform = 2*dim
+  int *nsendproc,*nrecvproc;    // # of procs to send/recv to/from in each swap
+  int *sendother;               // 1 if send to any other proc in each swap
+  int *sendself;                // 1 if send to self in each swap
+  int *nprocmax;                // current max # of send procs for each swap
   int **sendproc,**recvproc;    // proc to send/recv to/from per swap/proc
+  int **sendnum,**recvnum;      // # of atoms to send/recv per swap/proc
   int **size_forward_recv;      // # of values to recv in each forward swap/proc
   int **firstrecv;              // where to put 1st recv atom per swap/proc
   int **size_reverse_send;      // # to send in each reverse comm per swap/proc
   int **size_reverse_recv;      // # to recv in each reverse comm per swap/proc
-
   int **forward_recv_offset;  // forward comm offsets in buf_recv per swap/proc
   int **reverse_recv_offset;  // reverse comm offsets in buf_recv per swap/proc
 
@@ -77,16 +79,53 @@ class CommTiled : public Comm {
   int maxsend,maxrecv;          // current size of send/recv buffer
   int maxforward,maxreverse;    // max # of datums in forward/reverse comm
 
+  int maxexchange;              // max # of datums/atom in exchange comm 
   int bufextra;                 // extra space beyond maxsend in send buffer
 
+  int maxreqstat;               // max size of Request and Status vectors
   MPI_Request *requests;
   MPI_Status *statuses;
 
   int comm_x_only,comm_f_only;      // 1 if only exchange x,f in for/rev comm
 
-  void grow_send(int, int);         // reallocate send buffer
-  void grow_recv(int);              // free/allocate recv buffer
-  void grow_list(int, int, int);    // reallocate sendlist for one swap/proc
+  struct RCBinfo {
+    double mysplit[3][2];      // fractional RCB bounding box for one proc
+    double cut;        	       // position of cut this proc owns
+    int dim;	               // dimension = 0/1/2 of cut
+  };
+
+  int noverlap;                // # of overlapping procs
+  int maxoverlap;              // current max length of overlap
+  int *overlap;                // list of overlapping procs
+
+  RCBinfo *rcbinfo;            // list of RCB info for all procs
+
+  double *prd;                 // local ptrs to Domain attributes
+  double *boxlo,*boxhi;
+  double *sublo,*subhi;
+
+  void init_buffers();
+
+  // box drop and other functions
+
+  typedef void (CommTiled::*BoxDropPtr)(int, double *, double *, int &);
+  BoxDropPtr box_drop;
+  void box_drop_brick(int, double *, double *, int &);
+  void box_drop_tiled(int, double *, double *, int &);
+  void box_drop_tiled_recurse(double *, double *, int, int, int &);
+
+  typedef void (CommTiled::*BoxOtherPtr)(int, int, int, double *, double *);
+  BoxOtherPtr box_other;
+  void box_other_brick(int, int, int, double *, double *);
+  void box_other_tiled(int, int, int, double *, double *);
+
+  void grow_send(int, int);            // reallocate send buffer
+  void grow_recv(int);                 // free/allocate recv buffer
+  void grow_list(int, int, int);       // reallocate sendlist for one swap/proc
+  void allocate_swap(int);             // allocate swap arrays
+  void grow_swap_send(int, int, int);  // grow swap arrays for send and recv
+  void grow_swap_recv(int, int);
+  void deallocate_swap(int);           // deallocate swap arrays
 };
 
 }
diff --git a/src/create_atoms.cpp b/src/create_atoms.cpp
index be5c479360..9777f23334 100644
--- a/src/create_atoms.cpp
+++ b/src/create_atoms.cpp
@@ -27,6 +27,8 @@
 #include "domain.h"
 #include "lattice.h"
 #include "region.h"
+#include "input.h"
+#include "variable.h"
 #include "random_park.h"
 #include "random_mars.h"
 #include "math_extra.h"
@@ -41,6 +43,7 @@ using namespace MathConst;
 
 enum{BOX,REGION,SINGLE,RANDOM};
 enum{ATOM,MOLECULE};
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 /* ---------------------------------------------------------------------- */
 
@@ -103,6 +106,8 @@ void CreateAtoms::command(int narg, char **arg)
   remapflag = 0;
   mode = ATOM;
   int molseed;
+  varflag = 0;
+  vstr = xstr = ystr = zstr = NULL;
 
   nbasis = domain->lattice->nbasis;
   basistype = new int[nbasis];
@@ -141,6 +146,33 @@ void CreateAtoms::command(int narg, char **arg)
       else if (strcmp(arg[iarg+1],"lattice") == 0) scaleflag = 1;
       else error->all(FLERR,"Illegal create_atoms command");
       iarg += 2;
+    } else if (strcmp(arg[iarg],"var") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal create_atoms command");
+      delete [] vstr;
+      int n = strlen(arg[iarg+1]) + 1;
+      vstr = new char[n];
+      strcpy(vstr,arg[iarg+1]);
+      varflag = 1;
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"set") == 0) {
+      if (iarg+3 > narg) error->all(FLERR,"Illegal create_atoms command");
+      if (strcmp(arg[iarg+1],"x") == 0) {
+        delete [] xstr;
+        int n = strlen(arg[iarg+2]) + 1;
+        xstr = new char[n];
+        strcpy(xstr,arg[iarg+2]);
+      } else if (strcmp(arg[iarg+1],"y") == 0) {
+        delete [] ystr;
+        int n = strlen(arg[iarg+2]) + 1;
+        ystr = new char[n];
+        strcpy(ystr,arg[iarg+2]);
+      } else if (strcmp(arg[iarg+1],"z") == 0) {
+        delete [] zstr;
+        int n = strlen(arg[iarg+2]) + 1;
+        zstr = new char[n];
+        strcpy(zstr,arg[iarg+2]);
+      } else error->all(FLERR,"Illegal create_atoms command");
+      iarg += 3;
     } else error->all(FLERR,"Illegal create_atoms command");
   }
 
@@ -178,6 +210,47 @@ void CreateAtoms::command(int narg, char **arg)
     ranmol = new RanMars(lmp,molseed+comm->me);
   }
 
+  // error check and further setup for variable test
+  // save local copy of each equal variable string so can restore at end
+
+  if (!vstr && (xstr || ystr || zstr))
+    error->all(FLERR,"Incomplete use of variables in create_atoms command");
+  if (vstr && (!xstr && !ystr && !zstr))
+    error->all(FLERR,"Incomplete use of variables in create_atoms command");
+
+  if (varflag) {
+    vvar = input->variable->find(vstr);
+    if (vvar < 0)
+      error->all(FLERR,"Variable name for create_atoms does not exist");
+    if (!input->variable->equalstyle(vvar))
+      error->all(FLERR,"Variable for create_atoms is invalid style");
+
+    if (xstr) {
+      xvar = input->variable->find(xstr);
+      if (xvar < 0)
+        error->all(FLERR,"Variable name for create_atoms does not exist");
+      if (!input->variable->equalstyle(xvar))
+        error->all(FLERR,"Variable for create_atoms is invalid style");
+      input->variable->equal_save(xvar,xstr_copy);
+    }
+    if (ystr) {
+      yvar = input->variable->find(ystr);
+      if (yvar < 0)
+        error->all(FLERR,"Variable name for create_atoms does not exist");
+      if (!input->variable->equalstyle(yvar))
+        error->all(FLERR,"Variable for create_atoms is invalid style");
+      input->variable->equal_save(yvar,ystr_copy);
+    }
+    if (zstr) {
+      zvar = input->variable->find(zstr);
+      if (zvar < 0)
+        error->all(FLERR,"Variable name for create_atoms does not exist");
+      if (!input->variable->equalstyle(zvar))
+        error->all(FLERR,"Variable for create_atoms is invalid style");
+      input->variable->equal_save(zvar,zstr_copy);
+    }
+  }
+
   // demand non-none lattice be defined for BOX and REGION
   // else setup scaling for SINGLE and RANDOM
   // could use domain->lattice->lattice2box() to do conversion of
@@ -226,17 +299,32 @@ void CreateAtoms::command(int narg, char **arg)
   }
 
   if (style == BOX || style == REGION) {
-    if (domain->xperiodic) {
-      if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
-      if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] -= 2.0*epsilon[0];
-    }
-    if (domain->yperiodic) {
-      if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
-      if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] -= 2.0*epsilon[1];
-    }
-    if (domain->zperiodic) {
-      if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
-      if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] -= 2.0*epsilon[2];
+    if (comm->layout != LAYOUT_TILED) {
+      if (domain->xperiodic) {
+        if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
+        if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] -= 2.0*epsilon[0];
+      }
+      if (domain->yperiodic) {
+        if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
+        if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] -= 2.0*epsilon[1];
+      }
+      if (domain->zperiodic) {
+        if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
+        if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] -= 2.0*epsilon[2];
+      }
+    } else {
+      if (domain->xperiodic) {
+        if (comm->mysplit[0][0] == 0.0) sublo[0] -= epsilon[0];
+        if (comm->mysplit[0][1] == 1.0) subhi[0] -= 2.0*epsilon[0];
+      }
+      if (domain->yperiodic) {
+        if (comm->mysplit[1][0] == 0.0) sublo[1] -= epsilon[1];
+        if (comm->mysplit[1][1] == 1.0) subhi[1] -= 2.0*epsilon[1];
+      }
+      if (domain->zperiodic) {
+        if (comm->mysplit[2][0] == 0.0) sublo[2] -= epsilon[2];
+        if (comm->mysplit[2][1] == 1.0) subhi[2] -= 2.0*epsilon[2];
+      }
     }
   }
 
@@ -259,6 +347,14 @@ void CreateAtoms::command(int narg, char **arg)
         fix->set_arrays(i);
   }
 
+  // restore each equal variable string previously saved
+
+  if (varflag) {
+    if (xstr) input->variable->equal_restore(xvar,xstr_copy);
+    if (ystr) input->variable->equal_restore(yvar,ystr_copy);
+    if (zstr) input->variable->equal_restore(zvar,zstr_copy);
+  }
+
   // set new total # of atoms and error check
 
   bigint nblocal = atom->nlocal;
@@ -409,6 +505,10 @@ void CreateAtoms::command(int narg, char **arg)
 
   delete ranmol;
   if (domain->lattice) delete [] basistype;
+  delete [] vstr;
+  delete [] xstr;
+  delete [] ystr;
+  delete [] zstr;
 
   // print status
 
@@ -509,7 +609,7 @@ void CreateAtoms::add_random()
   }
 
   // generate random positions for each new atom/molecule within bounding box
-  // iterate until atom is within region and triclinic simulation box
+  // iterate until atom is within region, variable, and triclinic simulation box
   // if final atom position is in my subbox, create it
 
   if (xlo > xhi || ylo > yhi || zlo > zhi)
@@ -527,6 +627,7 @@ void CreateAtoms::add_random()
       if (nregion >= 0 &&
           domain->regions[nregion]->match(xone[0],xone[1],xone[2]) == 0)
         valid = 0;
+      if (varflag && vartest(xone) == 0) valid = 0;
       if (triclinic) {
         domain->x2lamda(xone,lamda);
         coord = lamda;
@@ -642,6 +743,10 @@ void CreateAtoms::add_lattice()
           if (style == REGION)
             if (!domain->regions[nregion]->match(x[0],x[1],x[2])) continue;
 
+          // if variable test specified, eval variable
+
+          if (varflag && vartest(x) == 0) continue;
+
           // test if atom/molecule position is in my subbox
 
           if (triclinic) {
@@ -696,3 +801,19 @@ void CreateAtoms::add_molecule(double *center)
     atom->add_molecule_atom(onemol,m,n,0);
   }
 }
+
+/* ----------------------------------------------------------------------
+   test a generated atom position against variable evaluation
+   first plug in x,y,z values as requested
+------------------------------------------------------------------------- */
+
+int CreateAtoms::vartest(double *x)
+{
+  if (xstr) input->variable->equal_override(xvar,x[0]);
+  if (ystr) input->variable->equal_override(yvar,x[1]);
+  if (zstr) input->variable->equal_override(zvar,x[2]);
+
+  double value = input->variable->compute_equal(vvar);
+  if (value == 0.0) return 0;
+  return 1;
+}
diff --git a/src/create_atoms.h b/src/create_atoms.h
index 26d3f475ef..a75a29d484 100644
--- a/src/create_atoms.h
+++ b/src/create_atoms.h
@@ -35,6 +35,10 @@ class CreateAtoms : protected Pointers {
   double xone[3];
   int remapflag;
 
+  int varflag,vvar,xvar,yvar,zvar;
+  char *vstr,*xstr,*ystr,*zstr;
+  char *xstr_copy,*ystr_copy,*zstr_copy;
+
   class Molecule *onemol;
   class RanMars *ranmol;
 
@@ -45,6 +49,7 @@ class CreateAtoms : protected Pointers {
   void add_random();
   void add_lattice();
   void add_molecule(double *);
+  int vartest(double *);        // evaluate a variable with new atom position
 };
 
 }
diff --git a/src/domain.cpp b/src/domain.cpp
index 5ffa35a82a..6f3344bb02 100644
--- a/src/domain.cpp
+++ b/src/domain.cpp
@@ -44,14 +44,15 @@
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
+enum{NO_REMAP,X_REMAP,V_REMAP};    // same as fix_deform.cpp
+enum{IGNORE,WARN,ERROR};           // same as thermo.cpp
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
+
 #define BIG   1.0e20
 #define SMALL 1.0e-4
 #define DELTAREGION 4
 #define BONDSTRETCH 1.1
 
-enum{NO_REMAP,X_REMAP,V_REMAP};    // same as fix_deform.cpp
-enum{IGNORE,WARN,ERROR};           // same as thermo.cpp
-
 /* ----------------------------------------------------------------------
    default is periodic
 ------------------------------------------------------------------------- */
@@ -249,55 +250,81 @@ void Domain::set_global_box()
 /* ----------------------------------------------------------------------
    set lamda box params
    assumes global box is defined and proc assignment has been made
-   uses comm->xyz_split to define subbox boundaries in consistent manner
+   uses comm->xyz_split or comm->mysplit
+     to define subbox boundaries in consistent manner
 ------------------------------------------------------------------------- */
 
 void Domain::set_lamda_box()
 {
-  int *myloc = comm->myloc;
-  double *xsplit = comm->xsplit;
-  double *ysplit = comm->ysplit;
-  double *zsplit = comm->zsplit;
+  if (comm->layout != LAYOUT_TILED) {
+    int *myloc = comm->myloc;
+    double *xsplit = comm->xsplit;
+    double *ysplit = comm->ysplit;
+    double *zsplit = comm->zsplit;
 
-  sublo_lamda[0] = xsplit[myloc[0]];
-  subhi_lamda[0] = xsplit[myloc[0]+1];
+    sublo_lamda[0] = xsplit[myloc[0]];
+    subhi_lamda[0] = xsplit[myloc[0]+1];
+    sublo_lamda[1] = ysplit[myloc[1]];
+    subhi_lamda[1] = ysplit[myloc[1]+1];
+    sublo_lamda[2] = zsplit[myloc[2]];
+    subhi_lamda[2] = zsplit[myloc[2]+1];
 
-  sublo_lamda[1] = ysplit[myloc[1]];
-  subhi_lamda[1] = ysplit[myloc[1]+1];
+  } else {
+    double (*mysplit)[2] = comm->mysplit;
 
-  sublo_lamda[2] = zsplit[myloc[2]];
-  subhi_lamda[2] = zsplit[myloc[2]+1];
+    sublo_lamda[0] = mysplit[0][0];
+    subhi_lamda[0] = mysplit[0][1];
+    sublo_lamda[1] = mysplit[1][0];
+    subhi_lamda[1] = mysplit[1][1];
+    sublo_lamda[2] = mysplit[2][0];
+    subhi_lamda[2] = mysplit[2][1];
+  }
 }
 
 /* ----------------------------------------------------------------------
    set local subbox params for orthogonal boxes
    assumes global box is defined and proc assignment has been made
-   uses comm->xyz_split to define subbox boundaries in consistent manner
+   uses comm->xyz_split or comm->mysplit
+     to define subbox boundaries in consistent manner
    insure subhi[max] = boxhi
 ------------------------------------------------------------------------- */
 
 void Domain::set_local_box()
 {
-  int *myloc = comm->myloc;
-  int *procgrid = comm->procgrid;
-  double *xsplit = comm->xsplit;
-  double *ysplit = comm->ysplit;
-  double *zsplit = comm->zsplit;
+  if (triclinic) return;
+      
+  if (comm->layout != LAYOUT_TILED) {
+    int *myloc = comm->myloc;
+    int *procgrid = comm->procgrid;
+    double *xsplit = comm->xsplit;
+    double *ysplit = comm->ysplit;
+    double *zsplit = comm->zsplit;
 
-  if (triclinic == 0) {
     sublo[0] = boxlo[0] + xprd*xsplit[myloc[0]];
-    if (myloc[0] < procgrid[0]-1)
-      subhi[0] = boxlo[0] + xprd*xsplit[myloc[0]+1];
+    if (myloc[0] < procgrid[0]-1) subhi[0] = boxlo[0] + xprd*xsplit[myloc[0]+1];
     else subhi[0] = boxhi[0];
 
     sublo[1] = boxlo[1] + yprd*ysplit[myloc[1]];
-    if (myloc[1] < procgrid[1]-1)
-      subhi[1] = boxlo[1] + yprd*ysplit[myloc[1]+1];
+    if (myloc[1] < procgrid[1]-1) subhi[1] = boxlo[1] + yprd*ysplit[myloc[1]+1];
     else subhi[1] = boxhi[1];
 
     sublo[2] = boxlo[2] + zprd*zsplit[myloc[2]];
-    if (myloc[2] < procgrid[2]-1)
-      subhi[2] = boxlo[2] + zprd*zsplit[myloc[2]+1];
+    if (myloc[2] < procgrid[2]-1) subhi[2] = boxlo[2] + zprd*zsplit[myloc[2]+1];
+    else subhi[2] = boxhi[2];
+
+  } else {
+    double (*mysplit)[2] = comm->mysplit;
+
+    sublo[0] = boxlo[0] + xprd*mysplit[0][0];
+    if (mysplit[0][1] < 1.0) subhi[0] = boxlo[0] + xprd*mysplit[0][1];
+    else subhi[0] = boxhi[0];
+
+    sublo[1] = boxlo[1] + yprd*mysplit[1][0];
+    if (mysplit[1][1] < 1.0) subhi[1] = boxlo[1] + yprd*mysplit[1][1];
+    else subhi[1] = boxhi[1];
+
+    sublo[2] = boxlo[2] + zprd*mysplit[2][0];
+    if (mysplit[2][1] < 1.0) subhi[2] = boxlo[2] + zprd*mysplit[2][1];
     else subhi[2] = boxhi[2];
   }
 }
diff --git a/src/dump_custom.cpp b/src/dump_custom.cpp
index f1cb9379fa..56a38e2402 100644
--- a/src/dump_custom.cpp
+++ b/src/dump_custom.cpp
@@ -34,7 +34,7 @@ using namespace LAMMPS_NS;
 // customize by adding keyword
 // also customize compute_atom_property.cpp
 
-enum{ID,MOL,PROC,TYPE,ELEMENT,MASS,
+enum{ID,MOL,PROC,PROCP1,TYPE,ELEMENT,MASS,
      X,Y,Z,XS,YS,ZS,XSTRI,YSTRI,ZSTRI,XU,YU,ZU,XUTRI,YUTRI,ZUTRI,
      XSU,YSU,ZSU,XSUTRI,YSUTRI,ZSUTRI,
      IX,IY,IZ,
@@ -470,6 +470,10 @@ int DumpCustom::count()
         for (i = 0; i < nlocal; i++) dchoose[i] = me;
         ptr = dchoose;
         nstride = 1;
+      } else if (thresh_array[ithresh] == PROCP1) {
+        for (i = 0; i < nlocal; i++) dchoose[i] = me;
+        ptr = dchoose;
+        nstride = 1;
       } else if (thresh_array[ithresh] == TYPE) {
         int *type = atom->type;
         for (i = 0; i < nlocal; i++) dchoose[i] = type[i];
@@ -1018,6 +1022,9 @@ int DumpCustom::parse_fields(int narg, char **arg)
     } else if (strcmp(arg[iarg],"proc") == 0) {
       pack_choice[i] = &DumpCustom::pack_proc;
       vtype[i] = INT;
+    } else if (strcmp(arg[iarg],"procp1") == 0) {
+      pack_choice[i] = &DumpCustom::pack_procp1;
+      vtype[i] = INT;
     } else if (strcmp(arg[iarg],"type") == 0) {
       pack_choice[i] = &DumpCustom::pack_type;
       vtype[i] = INT;
@@ -1497,6 +1504,7 @@ int DumpCustom::modify_param(int narg, char **arg)
     if (strcmp(arg[1],"id") == 0) thresh_array[nthresh] = ID;
     else if (strcmp(arg[1],"mol") == 0) thresh_array[nthresh] = MOL;
     else if (strcmp(arg[1],"proc") == 0) thresh_array[nthresh] = PROC;
+    else if (strcmp(arg[1],"procp1") == 0) thresh_array[nthresh] = PROCP1;
     else if (strcmp(arg[1],"type") == 0) thresh_array[nthresh] = TYPE;
     else if (strcmp(arg[1],"mass") == 0) thresh_array[nthresh] = MASS;
 
@@ -1876,6 +1884,16 @@ void DumpCustom::pack_proc(int n)
 
 /* ---------------------------------------------------------------------- */
 
+void DumpCustom::pack_procp1(int n)
+{
+  for (int i = 0; i < nchoose; i++) {
+    buf[n] = me+1;
+    n += size_one;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 void DumpCustom::pack_type(int n)
 {
   int *type = atom->type;
diff --git a/src/dump_custom.h b/src/dump_custom.h
index c483d5b7e1..ea3c5ecff8 100644
--- a/src/dump_custom.h
+++ b/src/dump_custom.h
@@ -124,6 +124,7 @@ class DumpCustom : public Dump {
   void pack_id(int);
   void pack_molecule(int);
   void pack_proc(int);
+  void pack_procp1(int);
   void pack_type(int);
   void pack_mass(int);
 
diff --git a/src/fix_adapt.cpp b/src/fix_adapt.cpp
index 22e718dd76..246d828e7d 100644
--- a/src/fix_adapt.cpp
+++ b/src/fix_adapt.cpp
@@ -498,8 +498,6 @@ void FixAdapt::restore_settings()
 
     } else if (ad->which == ATOM) {
       if (diamflag) {
-        int mflag = 0;
-        if (atom->rmass_flag) mflag = 1;
         double density;
 
         double *vec = fix_diam->vstore;
diff --git a/src/fix_balance.cpp b/src/fix_balance.cpp
index e4b7b28b7d..c617d5d042 100644
--- a/src/fix_balance.cpp
+++ b/src/fix_balance.cpp
@@ -22,12 +22,14 @@
 #include "irregular.h"
 #include "force.h"
 #include "kspace.h"
+#include "rcb.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
-enum{SHIFT,RCB};
+enum{SHIFT,BISECTION};
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 /* ---------------------------------------------------------------------- */
 
@@ -53,7 +55,7 @@ FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
   thresh = force->numeric(FLERR,arg[4]);
 
   if (strcmp(arg[5],"shift") == 0) lbstyle = SHIFT;
-  else if (strcmp(arg[5],"rcb") == 0) lbstyle = RCB;
+  else if (strcmp(arg[5],"rcb") == 0) lbstyle = BISECTION;
   else error->all(FLERR,"Illegal fix balance command");
 
   int iarg = 5;
@@ -65,7 +67,7 @@ FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
     stopthresh = force->numeric(FLERR,arg[iarg+3]);
     if (stopthresh < 1.0) error->all(FLERR,"Illegal fix balance command");
     iarg += 4;
-  } else if (lbstyle == RCB) {
+  } else if (lbstyle == BISECTION) {
     iarg++;
   }
 
@@ -97,14 +99,16 @@ FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
     }
   }
 
-  // create instance of Balance class and initialize it with params
-  // NOTE: do I need Balance instance if RCB?
-  // create instance of Irregular class
+  if (lbstyle == BISECTION && comm->style == 0) 
+    error->all(FLERR,"Fix balance rcb cannot be used with comm_style brick");
+
+  // create instance of Balance class
+  // if SHIFT, initialize it with params
 
   balance = new Balance(lmp);
-
   if (lbstyle == SHIFT) balance->shift_setup(bstr,nitermax,thresh);
-  if (lbstyle == RCB) error->all(FLERR,"Fix balance rcb is not yet supported");
+
+  // create instance of Irregular class
 
   irregular = new Irregular(lmp);
 
@@ -238,32 +242,33 @@ void FixBalance::rebalance()
 {
   imbprev = imbnow;
 
+  // invoke balancer and reset comm->uniform flag
+
+  int *sendproc;
   if (lbstyle == SHIFT) {
     itercount = balance->shift();
-  } else if (lbstyle == RCB) {
+    comm->layout = LAYOUT_NONUNIFORM;
+  } else if (lbstyle == BISECTION) {
+    sendproc = balance->bisection();
+    comm->layout = LAYOUT_TILED;
   }
 
   // output of final result
 
   if (fp) balance->dumpout(update->ntimestep,fp);
 
-  // reset comm->uniform flag
-  // NOTE: this needs to change with RCB
-
-  comm->uniform = 0;
-
   // reset proc sub-domains
 
   if (domain->triclinic) domain->set_lamda_box();
   domain->set_local_box();
 
-  // if splits moved further than neighboring processor
   // move atoms to new processors via irregular()
-  // only needed if migrate_check() says an atom moves to far,
+  // only needed if migrate_check() says an atom moves to far
   // else allow caller's comm->exchange() to do it
 
   if (domain->triclinic) domain->x2lamda(atom->nlocal);
-  if (irregular->migrate_check()) irregular->migrate_atoms();
+  if (lbstyle == BISECTION) irregular->migrate_atoms(0,sendproc);
+  else if (irregular->migrate_check()) irregular->migrate_atoms();
   if (domain->triclinic) domain->lamda2x(atom->nlocal);
 
   // invoke KSpace setup_grid() to adjust to new proc sub-domains
@@ -303,5 +308,6 @@ double FixBalance::compute_vector(int i)
 double FixBalance::memory_usage()
 {
   double bytes = irregular->memory_usage();
+  if (balance->rcb) bytes += balance->rcb->memory_usage();
   return bytes;
 }
diff --git a/src/fix_deform.cpp b/src/fix_deform.cpp
index 3401c61a67..3d350c569a 100644
--- a/src/fix_deform.cpp
+++ b/src/fix_deform.cpp
@@ -987,3 +987,14 @@ void FixDeform::options(int narg, char **arg)
     } else error->all(FLERR,"Illegal fix deform command");
   }
 }
+
+/* ----------------------------------------------------------------------
+   memory usage of Irregular
+------------------------------------------------------------------------- */
+
+double FixDeform::memory_usage()
+{
+  double bytes = 0.0;
+  if (irregular) bytes += irregular->memory_usage();
+  return bytes;
+}
diff --git a/src/fix_deform.h b/src/fix_deform.h
index 7a781629dc..469fa87761 100644
--- a/src/fix_deform.h
+++ b/src/fix_deform.h
@@ -35,6 +35,7 @@ class FixDeform : public Fix {
   void init();
   void pre_exchange();
   void end_of_step();
+  double memory_usage();
 
  private:
   int triclinic,scaleflag,flipflag;
diff --git a/src/fix_nh.cpp b/src/fix_nh.cpp
index 1abc05c739..b254636111 100644
--- a/src/fix_nh.cpp
+++ b/src/fix_nh.cpp
@@ -2271,3 +2271,14 @@ void FixNH::pre_exchange()
     domain->lamda2x(atom->nlocal);
   }
 }
+
+/* ----------------------------------------------------------------------
+   memory usage of Irregular
+------------------------------------------------------------------------- */
+
+double FixNH::memory_usage()
+{
+  double bytes = 0.0;
+  if (irregular) bytes += irregular->memory_usage();
+  return bytes;
+}
diff --git a/src/fix_nh.h b/src/fix_nh.h
index 630c3cd68f..0e676b4a0d 100644
--- a/src/fix_nh.h
+++ b/src/fix_nh.h
@@ -39,6 +39,7 @@ class FixNH : public Fix {
   void reset_target(double);
   void reset_dt();
   virtual void *extract(const char*,int &);
+  double memory_usage();
 
  protected:
   int dimension,which;
diff --git a/src/input.cpp b/src/input.cpp
index b5affba078..f0bcfe7867 100644
--- a/src/input.cpp
+++ b/src/input.cpp
@@ -1161,19 +1161,15 @@ void Input::comm_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal comm_style command");
   if (strcmp(arg[0],"brick") == 0) {
-    if (comm->layout)
-      error->all(FLERR,
-                 "Cannot switch to comm style brick from "
-                 "irregular tiling of proc domains");
-    comm = new CommBrick(lmp);
-    // NOTE: this will lose load balancing info in old CommBrick
-    if (domain->box_exist) {
-      comm->set_proc_grid();
-      domain->set_local_box();
-    }
+    if (comm->style == 0) return;
+    Comm *oldcomm = comm;
+    comm = new CommBrick(lmp,oldcomm);
+    delete oldcomm;
   } else if (strcmp(arg[0],"tiled") == 0) {
-    error->all(FLERR,"Comm_style tiled not yet supported");
-    comm = new CommTiled(lmp);
+    if (comm->style == 1) return;
+    Comm *oldcomm = comm;
+    comm = new CommTiled(lmp,oldcomm);
+    delete oldcomm;
   } else error->all(FLERR,"Illegal comm_style command");
 }
 
diff --git a/src/irregular.cpp b/src/irregular.cpp
index b55d964327..9fd478a9da 100644
--- a/src/irregular.cpp
+++ b/src/irregular.cpp
@@ -30,6 +30,8 @@ using namespace LAMMPS_NS;
 int *Irregular::proc_recv_copy;
 int compare_standalone(const void *, const void *);
 
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
+
 #define BUFFACTOR 1.5
 #define BUFMIN 1000
 #define BUFEXTRA 1000
@@ -43,13 +45,26 @@ Irregular::Irregular(LAMMPS *lmp) : Pointers(lmp)
 
   triclinic = domain->triclinic;
   map_style = atom->map_style;
-  procgrid = comm->procgrid;
-  grid2proc = comm->grid2proc;
 
-  aplan = NULL;
-  dplan = NULL;
+  // migrate work vectors
 
-  // initialize buffers for atom comm, not used for datum comm
+  maxlocal = 0;
+  mproclist = NULL;
+  msizes = NULL;
+
+  // send buffers
+
+  maxdbuf = 0;
+  dbuf = NULL;
+  maxbuf = 0;
+  buf = NULL;
+
+  // universal work vectors
+
+  memory->create(work1,nprocs,"irregular:work1");
+  memory->create(work2,nprocs,"irregular:work2");
+
+  // initialize buffers for migrate atoms, not used for datum comm
   // these can persist for multiple irregular operations
 
   maxsend = BUFMIN;
@@ -62,9 +77,12 @@ Irregular::Irregular(LAMMPS *lmp) : Pointers(lmp)
 
 Irregular::~Irregular()
 {
-  if (aplan) destroy_atom();
-  if (dplan) destroy_data();
-
+  memory->destroy(mproclist);
+  memory->destroy(msizes);
+  memory->destroy(dbuf);
+  memory->destroy(buf);
+  memory->destroy(work1);
+  memory->destroy(work2);
   memory->destroy(buf_send);
   memory->destroy(buf_recv);
 }
@@ -74,11 +92,13 @@ Irregular::~Irregular()
    can be used in place of comm->exchange()
    unlike exchange(), allows atoms to have moved arbitrarily long distances
    sets up irregular plan, invokes it, destroys it
+   sortflag = flag for sorting order of received messages by proc ID
+   procassign = non-NULL if already know procs atoms are assigned to (from RCB)
    atoms MUST be remapped to be inside simulation box before this is called
    for triclinic: atoms must be in lamda coords (0-1) before this is called
 ------------------------------------------------------------------------- */
 
-void Irregular::migrate_atoms(int sortflag)
+void Irregular::migrate_atoms(int sortflag, int *procassign)
 {
   // clear global->local map since atoms move to new procs
   // clear old ghosts so map_set() at end will operate only on local atoms
@@ -101,13 +121,16 @@ void Irregular::migrate_atoms(int sortflag)
     subhi = domain->subhi_lamda;
   }
 
-  uniform = comm->uniform;
+  layout = comm->layout;
   xsplit = comm->xsplit;
   ysplit = comm->ysplit;
   zsplit = comm->zsplit;
   boxlo = domain->boxlo;
   prd = domain->prd;
 
+  procgrid = comm->procgrid;
+  grid2proc = comm->grid2proc;
+
   // loop over atoms, flag any that are not in my sub-box
   // fill buffer with atoms leaving my box, using < and >=
   // assign which proc it belongs to via coord2proc()
@@ -119,41 +142,63 @@ void Irregular::migrate_atoms(int sortflag)
   double **x = atom->x;
   int nlocal = atom->nlocal;
 
+  if (nlocal > maxlocal) {
+    maxlocal = nlocal;
+    memory->destroy(mproclist);
+    memory->destroy(msizes);
+    memory->create(mproclist,maxlocal,"irregular:mproclist");
+    memory->create(msizes,maxlocal,"irregular:msizes");
+  }
+
+  int igx,igy,igz;
   int nsend = 0;
   int nsendatom = 0;
-  int *sizes = new int[nlocal];
-  int *proclist = new int[nlocal];
-  int igx,igy,igz;
-
   int i = 0;
-  while (i < nlocal) {
-    if (x[i][0] < sublo[0] || x[i][0] >= subhi[0] ||
-        x[i][1] < sublo[1] || x[i][1] >= subhi[1] ||
-        x[i][2] < sublo[2] || x[i][2] >= subhi[2]) {
-      proclist[nsendatom] = coord2proc(x[i],igx,igy,igz);
-      if (proclist[nsendatom] != me) {
+
+  if (procassign) {
+    while (i < nlocal) {
+      if (procassign[i] == me) i++;
+      else {
+        mproclist[nsendatom] = procassign[i];
         if (nsend > maxsend) grow_send(nsend,1);
-        sizes[nsendatom] = avec->pack_exchange(i,&buf_send[nsend]);
-        nsend += sizes[nsendatom];
+        msizes[nsendatom] = avec->pack_exchange(i,&buf_send[nsend]);
+        nsend += msizes[nsendatom];
         nsendatom++;
         avec->copy(nlocal-1,i,1);
+        procassign[i] = procassign[nlocal-1];
         nlocal--;
+      }
+    }
+
+  } else {
+    while (i < nlocal) {
+      if (x[i][0] < sublo[0] || x[i][0] >= subhi[0] ||
+          x[i][1] < sublo[1] || x[i][1] >= subhi[1] ||
+          x[i][2] < sublo[2] || x[i][2] >= subhi[2]) {
+        mproclist[nsendatom] = coord2proc(x[i],igx,igy,igz);
+        if (mproclist[nsendatom] == me) i++;
+        else {
+          if (nsend > maxsend) grow_send(nsend,1);
+          msizes[nsendatom] = avec->pack_exchange(i,&buf_send[nsend]);
+          nsend += msizes[nsendatom];
+          nsendatom++;
+          avec->copy(nlocal-1,i,1);
+          nlocal--;
+        }
       } else i++;
-    } else i++;
+    }
   }
+
   atom->nlocal = nlocal;
 
   // create irregular communication plan, perform comm, destroy plan
   // returned nrecv = size of buffer needed for incoming atoms
 
-  int nrecv = create_atom(nsendatom,sizes,proclist,sortflag);
+  int nrecv = create_atom(nsendatom,msizes,mproclist,sortflag);
   if (nrecv > maxrecv) grow_recv(nrecv);
-  exchange_atom(buf_send,sizes,buf_recv);
+  exchange_atom(buf_send,msizes,buf_recv);
   destroy_atom();
 
-  delete [] sizes;
-  delete [] proclist;
-
   // add received atoms to my list
 
   int m = 0;
@@ -174,6 +219,11 @@ void Irregular::migrate_atoms(int sortflag)
 
 int Irregular::migrate_check()
 {
+  // migrate required if comm layout is tiled
+  // cannot use myloc[] logic below
+
+  if (comm->layout == LAYOUT_TILED) return 1;
+
   // subbox bounds for orthogonal or triclinic box
   // other comm/domain data used by coord2proc()
 
@@ -186,13 +236,16 @@ int Irregular::migrate_check()
     subhi = domain->subhi_lamda;
   }
 
-  uniform = comm->uniform;
+  layout = comm->layout;
   xsplit = comm->xsplit;
   ysplit = comm->ysplit;
   zsplit = comm->zsplit;
   boxlo = domain->boxlo;
   prd = domain->prd;
 
+  procgrid = comm->procgrid;
+  grid2proc = comm->grid2proc;
+
   // loop over atoms, check for any that are not in my sub-box
   // assign which proc it belongs to via coord2proc()
   // if logical igx,igy,igz of newproc > one away from myloc, set flag = 1
@@ -250,6 +303,7 @@ int Irregular::migrate_check()
    n = # of atoms to send
    sizes = # of doubles for each atom
    proclist = proc to send each atom to (not including self)
+   sortflag = flag for sorting order of received messages by proc ID
    return total # of doubles I will recv (not including self)
 ------------------------------------------------------------------------- */
 
@@ -257,51 +311,60 @@ int Irregular::create_atom(int n, int *sizes, int *proclist, int sortflag)
 {
   int i;
 
-  // allocate plan and work vectors
-
-  if (aplan) destroy_atom();
-  aplan = (PlanAtom *) memory->smalloc(sizeof(PlanAtom),"irregular:aplan");
-  int *list = new int[nprocs];
-  int *count = new int[nprocs];
-
-  // nrecv = # of messages I receive
+  // setup for collective comm
+  // work1 = 1 for procs I send a message to, not including self
+  // work2 = 1 for all procs, used for ReduceScatter
 
   for (i = 0; i < nprocs; i++) {
-    list[i] = 0;
-    count[i] = 1;
+    work1[i] = 0;
+    work2[i] = 1;
   }
-  for (i = 0; i < n; i++) list[proclist[i]] = 1;
+  for (i = 0; i < n; i++) work1[proclist[i]] = 1;
+  work1[me] = 0;
 
-  int nrecv;
-  MPI_Reduce_scatter(list,&nrecv,count,MPI_INT,MPI_SUM,world);
+  // nrecv_proc = # of procs I receive messages from, not including self
+  // options for performing ReduceScatter operation
+  // some are more efficient on some machines at big sizes
+
+#ifdef LAMMPS_RS_ALLREDUCE_INPLACE
+  MPI_Allreduce(MPI_IN_PLACE,work1,nprocs,MPI_INT,MPI_SUM,world);
+  nrecv_proc = work1[me];
+#else 
+#ifdef LAMMPS_RS_ALLREDUCE
+  MPI_Allreduce(work1,work2,nprocs,MPI_INT,MPI_SUM,world);
+  nrecv_proc = work2[me];
+#else
+  MPI_Reduce_scatter(work1,&nrecv_proc,work2,MPI_INT,MPI_SUM,world);
+#endif
+#endif
 
   // allocate receive arrays
 
-  int *proc_recv = new int[nrecv];
-  int *length_recv = new int[nrecv];
-  MPI_Request *request = new MPI_Request[nrecv];
-  MPI_Status *status = new MPI_Status[nrecv];
+  proc_recv = new int[nrecv_proc];
+  length_recv = new int[nrecv_proc];
+  request = new MPI_Request[nrecv_proc];
+  status = new MPI_Status[nrecv_proc];
 
-  // nsend = # of messages I send
+  // nsend_proc = # of messages I send
 
-  for (i = 0; i < nprocs; i++) list[i] = 0;
-  for (i = 0; i < n; i++) list[proclist[i]] += sizes[i];
+  for (i = 0; i < nprocs; i++) work1[i] = 0;
+  for (i = 0; i < n; i++) work1[proclist[i]] += sizes[i];
 
-  int nsend = 0;
+  nsend_proc = 0;
   for (i = 0; i < nprocs; i++)
-    if (list[i]) nsend++;
+    if (work1[i]) nsend_proc++;
 
   // allocate send arrays
 
-  int *proc_send = new int[nsend];
-  int *length_send = new int[nsend];
-  int *num_send = new int[nsend];
-  int *index_send = new int[n];
-  int *offset_send = new int[n];
+  proc_send = new int[nsend_proc];
+  length_send = new int[nsend_proc];
+  num_send = new int[nsend_proc];
+  index_send = new int[n];
+  offset_send = new int[n];
 
   // list still stores size of message for procs I send to
   // proc_send = procs I send to
-  // length_send = total size of message I send to each proc
+  // length_send = # of doubles I send to each proc
   // to balance pattern of send messages:
   //   each proc begins with iproc > me, continues until iproc = me
   // reset list to store which send message each proc corresponds to
@@ -311,81 +374,81 @@ int Irregular::create_atom(int n, int *sizes, int *proclist, int sortflag)
   for (i = 0; i < nprocs; i++) {
     iproc++;
     if (iproc == nprocs) iproc = 0;
-    if (list[iproc] > 0) {
+    if (work1[iproc] > 0) {
       proc_send[isend] = iproc;
-      length_send[isend] = list[iproc];
-      list[iproc] = isend;
+      length_send[isend] = work1[iproc];
+      work1[iproc] = isend;
       isend++;
     }
   }
 
   // num_send = # of atoms I send to each proc
 
-  for (i = 0; i < nsend; i++) num_send[i] = 0;
+  for (i = 0; i < nsend_proc; i++) num_send[i] = 0;
   for (i = 0; i < n; i++) {
-    isend = list[proclist[i]];
+    isend = work1[proclist[i]];
     num_send[isend]++;
   }
 
-  // count = offsets into index_send for each proc I send to
+  // work2 = offsets into index_send for each proc I send to
   // index_send = list of which atoms to send to each proc
   //   1st N1 values are atom indices for 1st proc,
   //   next N2 values are atom indices for 2nd proc, etc
   // offset_send = where each atom starts in send buffer
 
-  count[0] = 0;
-  for (i = 1; i < nsend; i++) count[i] = count[i-1] + num_send[i-1];
+  work2[0] = 0;
+  for (i = 1; i < nsend_proc; i++) work2[i] = work2[i-1] + num_send[i-1];
 
   for (i = 0; i < n; i++) {
-    isend = list[proclist[i]];
-    index_send[count[isend]++] = i;
+    isend = work1[proclist[i]];
+    index_send[work2[isend]++] = i;
     if (i) offset_send[i] = offset_send[i-1] + sizes[i-1];
     else offset_send[i] = 0;
   }
 
   // tell receivers how much data I send
-  // sendmax = largest # of doubles I send in a single message
+  // sendmax_proc = # of doubles I send in largest single message
 
-  int sendmax = 0;
-  for (i = 0; i < nsend; i++) {
+  sendmax_proc = 0;
+  for (i = 0; i < nsend_proc; i++) {
     MPI_Send(&length_send[i],1,MPI_INT,proc_send[i],0,world);
-    sendmax = MAX(sendmax,length_send[i]);
+    sendmax_proc = MAX(sendmax_proc,length_send[i]);
   }
 
   // receive incoming messages
   // proc_recv = procs I recv from
-  // length_recv = total size of message each proc sends me
-  // nrecvsize = total size of data I recv
+  // length_recv = # of doubles each proc sends me
+  // nrecvsize = total size of atom data I recv
 
   int nrecvsize = 0;
-  for (i = 0; i < nrecv; i++) {
+  for (i = 0; i < nrecv_proc; i++) {
     MPI_Recv(&length_recv[i],1,MPI_INT,MPI_ANY_SOURCE,0,world,status);
     proc_recv[i] = status->MPI_SOURCE;
     nrecvsize += length_recv[i];
   }
 
-  // sort proc_recv and num_recv by proc ID if requested
+  // sort proc_recv and length_recv by proc ID if requested
   // useful for debugging to insure reproducible ordering of received atoms
   // invoke by adding final arg = 1 to create_atom() call in migrate_atoms()
 
   if (sortflag) {
-    int *order = new int[nrecv];
-    int *proc_recv_ordered = new int[nrecv];
-    int *length_recv_ordered = new int[nrecv];
+    int *order = new int[nrecv_proc];
+    int *proc_recv_ordered = new int[nrecv_proc];
+    int *length_recv_ordered = new int[nrecv_proc];
 
-    for (i = 0; i < nrecv; i++) order[i] = i;
+    for (i = 0; i < nrecv_proc; i++) order[i] = i;
     proc_recv_copy = proc_recv;
-    qsort(order,nrecv,sizeof(int),compare_standalone);
+    qsort(order,nrecv_proc,sizeof(int),compare_standalone);
 
     int j;
-    for (i = 0; i < nrecv; i++) {
+    for (i = 0; i < nrecv_proc; i++) {
       j = order[i];
       proc_recv_ordered[i] = proc_recv[j];
       length_recv_ordered[i] = length_recv[j];
     }
 
-    memcpy(proc_recv,proc_recv_ordered,nrecv*sizeof(int));
-    memcpy(length_recv,length_recv_ordered,nrecv*sizeof(int));
+    memcpy(proc_recv,proc_recv_ordered,nrecv_proc*sizeof(int));
+    memcpy(length_recv,length_recv_ordered,nrecv_proc*sizeof(int));
     delete [] order;
     delete [] proc_recv_ordered;
     delete [] length_recv_ordered;
@@ -396,27 +459,7 @@ int Irregular::create_atom(int n, int *sizes, int *proclist, int sortflag)
 
   MPI_Barrier(world);
 
-  // free work vectors
-
-  delete [] count;
-  delete [] list;
-
-  // initialize plan
-
-  aplan->nsend = nsend;
-  aplan->nrecv = nrecv;
-  aplan->sendmax = sendmax;
-
-  aplan->proc_send = proc_send;
-  aplan->length_send = length_send;
-  aplan->num_send = num_send;
-  aplan->index_send = index_send;
-  aplan->offset_send = offset_send;
-  aplan->proc_recv = proc_recv;
-  aplan->length_recv = length_recv;
-
-  aplan->request = request;
-  aplan->status = status;
+  // return size of atom data I will receive
 
   return nrecvsize;
 }
@@ -445,217 +488,226 @@ int compare_standalone(const void *iptr, const void *jptr)
 
 void Irregular::exchange_atom(double *sendbuf, int *sizes, double *recvbuf)
 {
-  int i,m,n,offset,num_send;
+  int i,m,n,offset,count;
 
   // post all receives
 
   offset = 0;
-  for (int irecv = 0; irecv < aplan->nrecv; irecv++) {
-    MPI_Irecv(&recvbuf[offset],aplan->length_recv[irecv],MPI_DOUBLE,
-              aplan->proc_recv[irecv],0,world,&aplan->request[irecv]);
-    offset += aplan->length_recv[irecv];
+  for (int irecv = 0; irecv < nrecv_proc; irecv++) {
+    MPI_Irecv(&recvbuf[offset],length_recv[irecv],MPI_DOUBLE,
+              proc_recv[irecv],0,world,&request[irecv]);
+    offset += length_recv[irecv];
   }
 
-  // allocate buf for largest send
+  // reallocate buf for largest send if necessary
 
-  double *buf;
-  memory->create(buf,aplan->sendmax,"irregular:buf");
+  if (sendmax_proc > maxdbuf) {
+    memory->destroy(dbuf);
+    maxdbuf = sendmax_proc;
+    memory->create(dbuf,maxdbuf,"irregular:dbuf");
+  }
 
   // send each message
   // pack buf with list of atoms
   // m = index of atom in sendbuf
 
-  int *index_send = aplan->index_send;
-  int nsend = aplan->nsend;
   n = 0;
-
-  for (int isend = 0; isend < nsend; isend++) {
+  for (int isend = 0; isend < nsend_proc; isend++) {
     offset = 0;
-    num_send = aplan->num_send[isend];
-    for (i = 0; i < num_send; i++) {
+    count = num_send[isend];
+    for (i = 0; i < count; i++) {
       m = index_send[n++];
-      memcpy(&buf[offset],&sendbuf[aplan->offset_send[m]],
-             sizes[m]*sizeof(double));
+      memcpy(&dbuf[offset],&sendbuf[offset_send[m]],sizes[m]*sizeof(double));
       offset += sizes[m];
     }
-    MPI_Send(buf,aplan->length_send[isend],MPI_DOUBLE,
-             aplan->proc_send[isend],0,world);
+    MPI_Send(dbuf,length_send[isend],MPI_DOUBLE,proc_send[isend],0,world);
   }
 
-  // free temporary send buffer
-
-  memory->destroy(buf);
-
   // wait on all incoming messages
 
-  if (aplan->nrecv) MPI_Waitall(aplan->nrecv,aplan->request,aplan->status);
+  if (nrecv_proc) MPI_Waitall(nrecv_proc,request,status);
 }
 
 /* ----------------------------------------------------------------------
-   destroy communication plan for atoms
+   destroy vectors in communication plan for atoms
 ------------------------------------------------------------------------- */
 
 void Irregular::destroy_atom()
 {
-  delete [] aplan->proc_send;
-  delete [] aplan->length_send;
-  delete [] aplan->num_send;
-  delete [] aplan->index_send;
-  delete [] aplan->offset_send;
-  delete [] aplan->proc_recv;
-  delete [] aplan->length_recv;
-  delete [] aplan->request;
-  delete [] aplan->status;
-  memory->sfree(aplan);
-  aplan = NULL;
+  delete [] proc_send;
+  delete [] length_send;
+  delete [] num_send;
+  delete [] index_send;
+  delete [] offset_send;
+  delete [] proc_recv;
+  delete [] length_recv;
+  delete [] request;
+  delete [] status;
 }
 
 /* ----------------------------------------------------------------------
-   create a communication plan for datums
+   create communication plan based on list of datums of uniform size
    n = # of datums to send
-   proclist = proc to send each datum to (including self)
-   return total # of datums I will recv (including self)
+   proclist = proc to send each datum to, can include self
+   sortflag = flag for sorting order of received messages by proc ID
+   return total # of datums I will recv, including any to self
 ------------------------------------------------------------------------- */
 
-int Irregular::create_data(int n, int *proclist)
+int Irregular::create_data(int n, int *proclist, int sortflag)
 {
   int i,m;
 
-  // allocate plan and work vectors
-
-  dplan = (PlanData *) memory->smalloc(sizeof(PlanData),"irregular:dplan");
-  int *list = new int[nprocs];
-  int *count = new int[nprocs];
-
-  // nrecv = # of messages I receive
+  // setup for collective comm
+  // work1 = 1 for procs I send a message to, not including self
+  // work2 = 1 for all procs, used for ReduceScatter
 
   for (i = 0; i < nprocs; i++) {
-    list[i] = 0;
-    count[i] = 1;
+    work1[i] = 0;
+    work2[i] = 1;
   }
-  for (i = 0; i < n; i++) list[proclist[i]] = 1;
+  for (i = 0; i < n; i++) work1[proclist[i]] = 1;
+  work1[me] = 0;
 
-  int nrecv;
-  MPI_Reduce_scatter(list,&nrecv,count,MPI_INT,MPI_SUM,world);
-  if (list[me]) nrecv--;
+  // nrecv_proc = # of procs I receive messages from, not including self
+  // options for performing ReduceScatter operation
+  // some are more efficient on some machines at big sizes
+
+#ifdef LAMMPS_RS_ALLREDUCE_INPLACE
+  MPI_Allreduce(MPI_IN_PLACE,work1,nprocs,MPI_INT,MPI_SUM,world);
+  nrecv_proc = work1[me];
+#else 
+#ifdef LAMMPS_RS_ALLREDUCE
+  MPI_Allreduce(work1,work2,nprocs,MPI_INT,MPI_SUM,world);
+  nrecv_proc = work2[me];
+#else
+  MPI_Reduce_scatter(work1,&nrecv_proc,work2,MPI_INT,MPI_SUM,world);
+#endif
+#endif
 
   // allocate receive arrays
 
-  int *proc_recv = new int[nrecv];
-  int *num_recv = new int[nrecv];
-  MPI_Request *request = new MPI_Request[nrecv];
-  MPI_Status *status = new MPI_Status[nrecv];
+  proc_recv = new int[nrecv_proc];
+  num_recv = new int[nrecv_proc];
+  request = new MPI_Request[nrecv_proc];
+  status = new MPI_Status[nrecv_proc];
 
-  // nsend = # of messages I send
+  // work1 = # of datums I send to each proc, including self
+  // nsend_proc = # of procs I send messages to, not including self
 
-  for (i = 0; i < nprocs; i++) list[i] = 0;
-  for (i = 0; i < n; i++) list[proclist[i]]++;
+  for (i = 0; i < nprocs; i++) work1[i] = 0;
+  for (i = 0; i < n; i++) work1[proclist[i]]++;
 
-  int nsend = 0;
+  nsend_proc = 0;
   for (i = 0; i < nprocs; i++)
-    if (list[i]) nsend++;
-  if (list[me]) nsend--;
+    if (work1[i]) nsend_proc++;
+  if (work1[me]) nsend_proc--;
 
   // allocate send and self arrays
 
-  int *proc_send = new int[nsend];
-  int *num_send = new int[nsend];
-  int *index_send = new int[n-list[me]];
-  int *index_self = new int[list[me]];
+  proc_send = new int[nsend_proc];
+  num_send = new int[nsend_proc];
+  index_send = new int[n-work1[me]];
+  index_self = new int[work1[me]];
 
   // proc_send = procs I send to
   // num_send = # of datums I send to each proc
   // num_self = # of datums I copy to self
   // to balance pattern of send messages:
   //   each proc begins with iproc > me, continues until iproc = me
-  // reset list to store which send message each proc corresponds to
-
-  int num_self;
+  // reset work1 to store which send message each proc corresponds to
 
   int iproc = me;
   int isend = 0;
   for (i = 0; i < nprocs; i++) {
     iproc++;
     if (iproc == nprocs) iproc = 0;
-    if (iproc == me) num_self = list[iproc];
-    else if (list[iproc] > 0) {
+    if (iproc == me) {
+      num_self = work1[iproc];
+      work1[iproc] = 0;
+    } else if (work1[iproc] > 0) {
       proc_send[isend] = iproc;
-      num_send[isend] = list[iproc];
-      list[iproc] = isend;
+      num_send[isend] = work1[iproc];
+      work1[iproc] = isend;
       isend++;
     }
   }
-  list[me] = 0;
 
-  // count = offsets into index_send for each proc I send to
+  // work2 = offsets into index_send for each proc I send to
   // m = ptr into index_self
   // index_send = list of which datums to send to each proc
   //   1st N1 values are datum indices for 1st proc,
   //   next N2 values are datum indices for 2nd proc, etc
+  // index_self = list of which datums to copy to self
 
-  count[0] = 0;
-  for (i = 1; i < nsend; i++) count[i] = count[i-1] + num_send[i-1];
+  work2[0] = 0;
+  for (i = 1; i < nsend_proc; i++) work2[i] = work2[i-1] + num_send[i-1];
 
   m = 0;
   for (i = 0; i < n; i++) {
     iproc = proclist[i];
     if (iproc == me) index_self[m++] = i;
     else {
-      isend = list[iproc];
-      index_send[count[isend]++] = i;
+      isend = work1[iproc];
+      index_send[work2[isend]++] = i;
     }
   }
 
   // tell receivers how much data I send
-  // sendmax = largest # of datums I send in a single message
+  // sendmax_proc = largest # of datums I send in a single message
 
-  int sendmax = 0;
-  for (i = 0; i < nsend; i++) {
+  sendmax_proc = 0;
+  for (i = 0; i < nsend_proc; i++) {
     MPI_Send(&num_send[i],1,MPI_INT,proc_send[i],0,world);
-    sendmax = MAX(sendmax,num_send[i]);
+    sendmax_proc = MAX(sendmax_proc,num_send[i]);
   }
 
   // receive incoming messages
   // proc_recv = procs I recv from
   // num_recv = total size of message each proc sends me
-  // nrecvsize = total size of data I recv
+  // nrecvdatum = total size of data I recv
 
-  int nrecvsize = 0;
-  for (i = 0; i < nrecv; i++) {
+  int nrecvdatum = 0;
+  for (i = 0; i < nrecv_proc; i++) {
     MPI_Recv(&num_recv[i],1,MPI_INT,MPI_ANY_SOURCE,0,world,status);
     proc_recv[i] = status->MPI_SOURCE;
-    nrecvsize += num_recv[i];
+    nrecvdatum += num_recv[i];
+  }
+  nrecvdatum += num_self;
+
+  // sort proc_recv and num_recv by proc ID if requested
+  // useful for debugging to insure reproducible ordering of received datums
+
+  if (sortflag) {
+    int *order = new int[nrecv_proc];
+    int *proc_recv_ordered = new int[nrecv_proc];
+    int *num_recv_ordered = new int[nrecv_proc];
+
+    for (i = 0; i < nrecv_proc; i++) order[i] = i;
+    proc_recv_copy = proc_recv;
+    qsort(order,nrecv_proc,sizeof(int),compare_standalone);
+
+    int j;
+    for (i = 0; i < nrecv_proc; i++) {
+      j = order[i];
+      proc_recv_ordered[i] = proc_recv[j];
+      num_recv_ordered[i] = num_recv[j];
+    }
+
+    memcpy(proc_recv,proc_recv_ordered,nrecv_proc*sizeof(int));
+    memcpy(num_recv,num_recv_ordered,nrecv_proc*sizeof(int));
+    delete [] order;
+    delete [] proc_recv_ordered;
+    delete [] num_recv_ordered;
   }
-  nrecvsize += num_self;
 
   // barrier to insure all MPI_ANY_SOURCE messages are received
   // else another proc could proceed to exchange_data() and send to me
 
   MPI_Barrier(world);
 
-  // free work vectors
+  // return # of datums I will receive
 
-  delete [] count;
-  delete [] list;
-
-  // initialize plan and return it
-
-  dplan->nsend = nsend;
-  dplan->nrecv = nrecv;
-  dplan->sendmax = sendmax;
-
-  dplan->proc_send = proc_send;
-  dplan->num_send = num_send;
-  dplan->index_send = index_send;
-  dplan->proc_recv = proc_recv;
-  dplan->num_recv = num_recv;
-  dplan->num_self = num_self;
-  dplan->index_self = index_self;
-
-  dplan->request = request;
-  dplan->status = status;
-
-  return nrecvsize;
+  return nrecvdatum;
 }
 
 /* ----------------------------------------------------------------------
@@ -667,49 +719,41 @@ int Irregular::create_data(int n, int *proclist)
 
 void Irregular::exchange_data(char *sendbuf, int nbytes, char *recvbuf)
 {
-  int i,m,n,offset,num_send;
+  int i,m,n,offset,count;
 
   // post all receives, starting after self copies
 
-  offset = dplan->num_self*nbytes;
-  for (int irecv = 0; irecv < dplan->nrecv; irecv++) {
-    MPI_Irecv(&recvbuf[offset],dplan->num_recv[irecv]*nbytes,MPI_CHAR,
-              dplan->proc_recv[irecv],0,world,&dplan->request[irecv]);
-    offset += dplan->num_recv[irecv]*nbytes;
+  offset = num_self*nbytes;
+  for (int irecv = 0; irecv < nrecv_proc; irecv++) {
+    MPI_Irecv(&recvbuf[offset],num_recv[irecv]*nbytes,MPI_CHAR,
+              proc_recv[irecv],0,world,&request[irecv]);
+    offset += num_recv[irecv]*nbytes;
   }
 
-  // allocate buf for largest send
+  // reallocate buf for largest send if necessary
 
-  char *buf;
-  memory->create(buf,dplan->sendmax*nbytes,"irregular:buf");
+  if (sendmax_proc*nbytes > maxbuf) {
+    memory->destroy(buf);
+    maxbuf = sendmax_proc*nbytes;
+    memory->create(buf,maxbuf,"irregular:buf");
+  }
 
   // send each message
   // pack buf with list of datums
   // m = index of datum in sendbuf
 
-  int *index_send = dplan->index_send;
-  int nsend = dplan->nsend;
   n = 0;
-
-  for (int isend = 0; isend < nsend; isend++) {
-    num_send = dplan->num_send[isend];
-    for (i = 0; i < num_send; i++) {
+  for (int isend = 0; isend < nsend_proc; isend++) {
+    count = num_send[isend];
+    for (i = 0; i < count; i++) {
       m = index_send[n++];
       memcpy(&buf[i*nbytes],&sendbuf[m*nbytes],nbytes);
     }
-    MPI_Send(buf,dplan->num_send[isend]*nbytes,MPI_CHAR,
-             dplan->proc_send[isend],0,world);
+    MPI_Send(buf,count*nbytes,MPI_CHAR,proc_send[isend],0,world);
   }
 
-  // free temporary send buffer
-
-  memory->destroy(buf);
-
   // copy datums to self, put at beginning of recvbuf
 
-  int *index_self = dplan->index_self;
-  int num_self = dplan->num_self;
-
   for (i = 0; i < num_self; i++) {
     m = index_self[i];
     memcpy(&recvbuf[i*nbytes],&sendbuf[m*nbytes],nbytes);
@@ -717,39 +761,37 @@ void Irregular::exchange_data(char *sendbuf, int nbytes, char *recvbuf)
 
   // wait on all incoming messages
 
-  if (dplan->nrecv) MPI_Waitall(dplan->nrecv,dplan->request,dplan->status);
+  if (nrecv_proc) MPI_Waitall(nrecv_proc,request,status);
 }
 
 /* ----------------------------------------------------------------------
-   destroy communication plan for datums
+   destroy vectors in communication plan for datums
 ------------------------------------------------------------------------- */
 
 void Irregular::destroy_data()
 {
-  delete [] dplan->proc_send;
-  delete [] dplan->num_send;
-  delete [] dplan->index_send;
-  delete [] dplan->proc_recv;
-  delete [] dplan->num_recv;
-  delete [] dplan->index_self;
-  delete [] dplan->request;
-  delete [] dplan->status;
-  memory->sfree(dplan);
-  dplan = NULL;
+  delete [] proc_send;
+  delete [] num_send;
+  delete [] index_send;
+  delete [] proc_recv;
+  delete [] num_recv;
+  delete [] index_self;
+  delete [] request;
+  delete [] status;
 }
 
 /* ----------------------------------------------------------------------
    determine which proc owns atom with coord x[3]
    x will be in box (orthogonal) or lamda coords (triclinic)
-   for uniform = 1, directly calculate owning proc
-   for non-uniform, iteratively find owning proc via binary search
+   if layout = UNIFORM, calculate owning proc directly
+   else layout = NONUNIFORM, iteratively find owning proc via binary search
    return owning proc ID via grid2proc
    return igx,igy,igz = logical grid loc of owing proc within 3d grid of procs
 ------------------------------------------------------------------------- */
 
 int Irregular::coord2proc(double *x, int &igx, int &igy, int &igz)
 {
-  if (uniform) {
+  if (layout == 0) {
     if (triclinic == 0) {
       igx = static_cast<int> (procgrid[0] * (x[0]-boxlo[0]) / prd[0]);
       igy = static_cast<int> (procgrid[1] * (x[1]-boxlo[1]) / prd[1]);
@@ -846,7 +888,12 @@ void Irregular::grow_recv(int n)
 
 bigint Irregular::memory_usage()
 {
-  bigint bytes = memory->usage(buf_send,maxsend);
-  bytes += memory->usage(buf_recv,maxrecv);
+  bigint bytes = 0;
+  bytes += maxsend*sizeof(double);   // buf_send
+  bytes += maxrecv*sizeof(double);   // buf_recv
+  bytes += maxdbuf*sizeof(double);   // dbuf
+  bytes += maxbuf;                   // buf
+  bytes += 2*maxlocal*sizeof(int);   // mproclist,msizes
+  bytes += 2*nprocs*sizeof(int);     // work1,work2
   return bytes;
 }
diff --git a/src/irregular.h b/src/irregular.h
index 576fad5c90..1fdbbe03f7 100644
--- a/src/irregular.h
+++ b/src/irregular.h
@@ -27,9 +27,9 @@ class Irregular : protected Pointers {
 
   Irregular(class LAMMPS *);
   ~Irregular();
-  void migrate_atoms(int sortflag = 0);
+  void migrate_atoms(int sortflag = 0, int *procassign = NULL);
   int migrate_check();
-  int create_data(int, int *);
+  int create_data(int, int *, int sortflag = 0);
   void exchange_data(char *, int, char *);
   void destroy_data();
   bigint memory_usage();
@@ -38,58 +38,58 @@ class Irregular : protected Pointers {
   int me,nprocs;
   int triclinic;
   int map_style;
-  int uniform;
+  int layout;
   double *xsplit,*ysplit,*zsplit;   // ptrs to comm
   int *procgrid;                    // ptr to comm
   int ***grid2proc;                 // ptr to comm
   double *boxlo;                    // ptr to domain
   double *prd;                      // ptr to domain
 
-  int maxsend,maxrecv;              // size of buffers in # of doubles
-  double *buf_send,*buf_recv;
+  int maxsend,maxrecv;              // size of buf send/recv in # of doubles
+  double *buf_send,*buf_recv;       // bufs used in migrate_atoms
+  int maxdbuf;                      // size of double buf in bytes
+  double *dbuf;                     // double buf for largest single atom send
+  int maxbuf;                       // size of char buf in bytes
+  char *buf;                        // char buf for largest single data send
 
-  // plan for irregular communication of atoms
+  int *mproclist,*msizes;           // persistent vectors in migrate_atoms
+  int maxlocal;                     // allocated size of mproclist and msizes
+
+  int *work1,*work2;                // work vectors
+
+  // plan params for irregular communication of atoms or datums
+  // no params refer to atoms/data copied to self
+
+  int nsend_proc;            // # of messages to send
+  int nrecv_proc;            // # of messages to recv
+  int sendmax_proc;          // # of doubles/datums in largest send message
+  int *proc_send;            // list of procs to send to
+  int *num_send;             // # of atoms/datums to send to each proc
+  int *index_send;           // list of which atoms/datums to send to each proc
+  int *proc_recv;            // list of procs to recv from
+  MPI_Request *request;      // MPI requests for posted recvs
+  MPI_Status *status;        // MPI statuses for WaitAll
+
+  // extra plan params plan for irregular communication of atoms
   // no params refer to atoms copied to self
 
-  struct PlanAtom {
-    int nsend;                 // # of messages to send
-    int nrecv;                 // # of messages to recv
-    int sendmax;               // # of doubles in largest send message
-    int *proc_send;            // procs to send to
-    int *length_send;          // # of doubles to send to each proc
-    int *num_send;             // # of atoms to send to each proc
-    int *index_send;           // list of which atoms to send to each proc
-    int *offset_send;          // where each atom starts in send buffer
-    int *proc_recv;            // procs to recv from
-    int *length_recv;          // # of doubles to recv from each proc
-    MPI_Request *request;      // MPI requests for posted recvs
-    MPI_Status *status;        // MPI statuses for WaitAll
-  };
+  int *length_send;          // # of doubles to send to each proc
+  int *length_recv;          // # of doubles to recv from each proc
+  int *offset_send;          // where each atom starts in send buffer
 
-  // plan for irregular communication of datums
-  // only 2 self params refer to atoms copied to self
+  // extra plan params plan for irregular communication of datums 
+  // 2 self params refer to data copied to self
 
-  struct PlanData {            // plan for irregular communication of data
-    int nsend;                 // # of messages to send
-    int nrecv;                 // # of messages to recv
-    int sendmax;               // # of datums in largest send message
-    int *proc_send;            // procs to send to
-    int *num_send;             // # of datums to send to each proc
-    int *index_send;           // list of which datums to send to each proc
-    int *proc_recv;            // procs to recv from
-    int *num_recv;             // # of datums to recv from each proc
-    int num_self;              // # of datums to copy to self
-    int *index_self;           // list of which datums to copy to self
-    MPI_Request *request;      // MPI requests for posted recvs
-    MPI_Status *status;        // MPI statuses for WaitAll
-  };
+  int *num_recv;             // # of datums to recv from each proc
+  int num_self;              // # of datums to copy to self
+  int *index_self;           // list of which datums to copy to self
 
-  PlanAtom *aplan;
-  PlanData *dplan;
+  // private methods
 
   int create_atom(int, int *, int *, int);
   void exchange_atom(double *, int *, double *);
   void destroy_atom();
+
   int coord2proc(double *, int &, int &, int &);
   int binary(double, int, double *);
 
diff --git a/src/rcb.cpp b/src/rcb.cpp
new file mode 100644
index 0000000000..de3f9f3ba6
--- /dev/null
+++ b/src/rcb.cpp
@@ -0,0 +1,926 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "string.h"
+#include "rcb.h"
+#include "irregular.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MYHUGE 1.0e30
+#define TINY 1.0e-6
+
+// set this to bigger number after debugging
+
+#define DELTA 10
+
+// prototypes for non-class functions
+
+void box_merge(void *, void *, int *, MPI_Datatype *);
+void median_merge(void *, void *, int *, MPI_Datatype *);
+
+// NOTE: if want to have reuse flag, need to sum Tree across procs
+
+/* ---------------------------------------------------------------------- */
+
+RCB::RCB(LAMMPS *lmp) : Pointers(lmp)
+{
+  MPI_Comm_rank(world,&me);
+  MPI_Comm_size(world,&nprocs);
+
+  ndot = maxdot = 0;
+  dots = NULL;
+
+  nlist = maxlist = 0;
+  dotlist = dotmark = NULL;
+
+  maxbuf = 0;
+  buf = NULL;
+
+  maxrecv = maxsend = 0;
+  recvproc = recvindex = sendproc = sendindex = NULL;
+
+  tree = NULL;
+  irregular = NULL;
+
+  // create MPI data and function types for box and median AllReduce ops
+
+  MPI_Type_contiguous(6,MPI_DOUBLE,&box_type);
+  MPI_Type_commit(&box_type);
+  MPI_Type_contiguous(sizeof(Median),MPI_CHAR,&med_type);
+  MPI_Type_commit(&med_type);
+
+  MPI_Op_create(box_merge,1,&box_op);
+  MPI_Op_create(median_merge,1,&med_op);
+
+  reuse = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+RCB::~RCB()
+{
+  memory->sfree(dots);
+  memory->destroy(dotlist);
+  memory->destroy(dotmark);
+  memory->sfree(buf);
+
+  memory->destroy(recvproc);
+  memory->destroy(recvindex);
+  memory->destroy(sendproc);
+  memory->destroy(sendindex);
+
+  memory->sfree(tree);
+  delete irregular;
+
+  MPI_Type_free(&med_type);
+  MPI_Type_free(&box_type);
+  MPI_Op_free(&box_op);
+  MPI_Op_free(&med_op);
+}
+
+/* ----------------------------------------------------------------------
+   perform RCB balancing of N particles at coords X in bounding box LO/HI
+   if wt = NULL, ignore per-particle weights
+   if wt defined, per-particle weights > 0.0
+   dimension = 2 or 3
+   as documented in rcb.h:
+     sets noriginal,nfinal,nkeep,recvproc,recvindex,lo,hi
+   all proc particles will be inside or on surface of 3-d box
+     defined by final lo/hi
+   // NOTE: worry about re-use of data structs for fix balance?
+   // NOTE: could get rid of wt all together, will it be used?
+------------------------------------------------------------------------- */
+
+void RCB::compute(int dimension, int n, double **x, double *wt,
+                  double *bboxlo, double *bboxhi)
+{
+  int i,j,k;
+  int keep,outgoing,incoming,incoming2;
+  int dim,markactive;
+  int indexlo,indexhi;
+  int first_iteration,breakflag;
+  double wttot,wtlo,wthi,wtsum,wtok,wtupto,wtmax;
+  double targetlo,targethi;
+  double valuemin,valuemax,valuehalf;
+  double tolerance;
+  MPI_Comm comm,comm_half;
+  MPI_Request request,request2;
+  MPI_Status status;
+  Median med,medme;
+
+  // create list of my Dots
+
+  ndot = nkeep = noriginal = n;
+
+  if (ndot > maxdot) {
+    maxdot = ndot;
+    memory->sfree(dots);
+    dots = (Dot *) memory->smalloc(ndot*sizeof(Dot),"RCB:dots");
+  }
+
+  for (i = 0; i < ndot; i++) {
+    dots[i].x[0] = x[i][0];
+    dots[i].x[1] = x[i][1];
+    dots[i].x[2] = x[i][2];
+    dots[i].proc = me;
+    dots[i].index = i;
+  }
+
+  if (wt)
+    for (i = 0; i < ndot; i++) dots[i].wt = wt[i];
+  else
+    for (i = 0; i < ndot; i++) dots[i].wt = 1.0;
+
+  // initial bounding box = simulation box
+  // includes periodic or shrink-wrapped boundaries
+  
+  lo = bbox.lo;
+  hi = bbox.hi;
+
+  lo[0] = bboxlo[0];
+  lo[1] = bboxlo[1];
+  lo[2] = bboxlo[2];
+  hi[0] = bboxhi[0];
+  hi[1] = bboxhi[1];
+  hi[2] = bboxhi[2];
+
+  // initialize counters
+
+  counters[0] = 0;
+  counters[1] = 0;
+  counters[2] = 0;
+  counters[3] = ndot;
+  counters[4] = maxdot;
+  counters[5] = 0;
+  counters[6] = 0;
+
+  // create communicator for use in recursion
+
+  MPI_Comm_dup(world,&comm);
+
+  // recurse until partition is a single proc = me
+  // proclower,procupper = lower,upper procs in partition
+  // procmid = 1st proc in upper half of partition
+
+  int procpartner,procpartner2;
+  int readnumber;                   
+
+  int procmid;
+  int proclower = 0;
+  int procupper = nprocs - 1;
+
+  while (proclower != procupper) {
+
+    // if odd # of procs, lower partition gets extra one
+
+    procmid = proclower + (procupper - proclower) / 2 + 1;
+
+    // determine communication partner(s)
+    // readnumber = # of proc partners to read from
+
+    if (me < procmid)
+      procpartner = me + (procmid - proclower);
+    else
+      procpartner = me - (procmid - proclower);
+    
+    int readnumber = 1;
+    if (procpartner > procupper) {
+      readnumber = 0;
+      procpartner--;
+    }
+    if (me == procupper && procpartner != procmid - 1) {
+      readnumber = 2;
+      procpartner2 = procpartner + 1;
+    }
+    
+    // wttot = summed weight of entire partition
+    // search tolerance = largest single weight (plus epsilon)
+    // targetlo = desired weight in lower half of partition
+    // targethi = desired weight in upper half of partition
+
+    wtmax = wtsum = 0.0;
+
+    if (wt) {
+      for (i = 0; i < ndot; i++) {
+        wtsum += dots[i].wt;
+        if (dots[i].wt > wtmax) wtmax = dots[i].wt;
+      }
+    } else {
+      for (i = 0; i < ndot; i++) wtsum += dots[i].wt;
+      wtmax = 1.0;
+    }
+
+    MPI_Allreduce(&wtsum,&wttot,1,MPI_DOUBLE,MPI_SUM,comm);
+    if (wt) MPI_Allreduce(&wtmax,&tolerance,1,MPI_DOUBLE,MPI_MAX,comm);
+    else tolerance = 1.0;
+
+    tolerance *= 1.0 + TINY;
+    targetlo = wttot * (procmid - proclower) / (procupper + 1 - proclower);
+    targethi = wttot - targetlo;
+
+    // dim = dimension to bisect on
+    // do not allow choice of z dimension for 2d system
+
+    dim = 0;
+    if (hi[1]-lo[1] > hi[0]-lo[0]) dim = 1;
+    if (dimension == 3) {
+      if (dim == 0 && hi[2]-lo[2] > hi[0]-lo[0]) dim = 2;
+      if (dim == 1 && hi[2]-lo[2] > hi[1]-lo[1]) dim = 2;
+    }
+    
+    // create active list and mark array for dots
+    // initialize active list to all dots
+
+    if (ndot > maxlist) {
+      memory->destroy(dotlist);
+      memory->destroy(dotmark);
+      maxlist = maxdot;
+      memory->create(dotlist,maxlist,"RCB:dotlist");
+      memory->create(dotmark,maxlist,"RCB:dotmark");
+    }
+
+    nlist = ndot;
+    for (i = 0; i < nlist; i++) dotlist[i] = i;
+
+    // median iteration
+    // zoom in on bisector until correct # of dots in each half of partition
+    // as each iteration of median-loop begins, require:
+    //   all non-active dots are marked with 0/1 in dotmark
+    //   valuemin <= every active dot <= valuemax
+    //   wtlo, wthi = total wt of non-active dots
+    // when leave median-loop, require only:
+    //   valuehalf = correct cut position
+    //   all dots <= valuehalf are marked with 0 in dotmark
+    //   all dots >= valuehalf are marked with 1 in dotmark
+    // markactive = which side of cut is active = 0/1
+    // indexlo,indexhi = indices of dot closest to median
+
+    wtlo = wthi = 0.0;
+    valuemin = lo[dim];
+    valuemax = hi[dim];
+    first_iteration = 1;
+
+    while (1) {
+
+      // choose bisector value
+      // use old value on 1st iteration if old cut dimension is the same
+      // on 2nd option: could push valuehalf towards geometric center 
+      //   with "1.0-factor" to force overshoot
+
+      if (first_iteration && reuse && dim == tree[procmid].dim) {
+	counters[5]++;
+	valuehalf = tree[procmid].cut;
+	if (valuehalf < valuemin || valuehalf > valuemax)
+	  valuehalf = 0.5 * (valuemin + valuemax);	  
+      } else if (wt)
+	valuehalf = valuemin + (targetlo - wtlo) /
+	  (wttot - wtlo - wthi) * (valuemax - valuemin);
+      else
+	valuehalf = 0.5 * (valuemin + valuemax);
+
+      first_iteration = 0;
+      
+      // initialize local median data structure
+
+      medme.totallo = medme.totalhi = 0.0;
+      medme.valuelo = -MYHUGE;
+      medme.valuehi = MYHUGE;
+      medme.wtlo = medme.wthi = 0.0;
+      medme.countlo = medme.counthi = 0;
+      medme.proclo = medme.prochi = me;
+
+      // mark all active dots on one side or other of bisector
+      // also set all fields in median data struct
+      // save indices of closest dots on either side
+
+      for (j = 0; j < nlist; j++) {
+	i = dotlist[j];
+	if (dots[i].x[dim] <= valuehalf) {            // in lower part
+	  medme.totallo += dots[i].wt;
+	  dotmark[i] = 0;
+	  if (dots[i].x[dim] > medme.valuelo) {       // my closest dot
+	    medme.valuelo = dots[i].x[dim];
+	    medme.wtlo = dots[i].wt;
+	    medme.countlo = 1;
+	    indexlo = i;
+	  } else if (dots[i].x[dim] == medme.valuelo) {   // tied for closest
+	    medme.wtlo += dots[i].wt;
+	    medme.countlo++;
+	  }
+	}
+	else {                                        // in upper part
+	  medme.totalhi += dots[i].wt;
+	  dotmark[i] = 1;
+	  if (dots[i].x[dim] < medme.valuehi) {       // my closest dot
+	    medme.valuehi = dots[i].x[dim];
+	    medme.wthi = dots[i].wt;
+	    medme.counthi = 1;
+	    indexhi = i;
+	  } else if (dots[i].x[dim] == medme.valuehi) {   // tied for closest
+	    medme.wthi += dots[i].wt;
+	    medme.counthi++;
+	  }
+	}
+      }
+
+      // combine median data struct across current subset of procs
+
+      counters[0]++;
+      MPI_Allreduce(&medme,&med,1,med_type,med_op,comm);
+
+      // test median guess for convergence
+      // move additional dots that are next to cut across it
+
+      if (wtlo + med.totallo < targetlo) {    // lower half TOO SMALL
+
+	wtlo += med.totallo;
+	valuehalf = med.valuehi;
+
+	if (med.counthi == 1) {                  // only one dot to move
+	  if (wtlo + med.wthi < targetlo) {  // move it, keep iterating
+	    if (me == med.prochi) dotmark[indexhi] = 0;
+	  }
+	  else {                                 // only move if beneficial
+	    if (wtlo + med.wthi - targetlo < targetlo - wtlo)
+	      if (me == med.prochi) dotmark[indexhi] = 0;
+	    break;                               // all done
+	  }
+	}
+	else {                                   // multiple dots to move
+	  breakflag = 0;
+	  wtok = 0.0;
+	  if (medme.valuehi == med.valuehi) wtok = medme.wthi;   
+	  if (wtlo + med.wthi >= targetlo) {                // all done
+	    MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
+	    wtmax = targetlo - wtlo;
+	    if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
+	    breakflag = 1;
+	  }                                      // wtok = most I can move
+	  for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
+	    i = dotlist[j];
+	    if (dots[i].x[dim] == med.valuehi) { // only move if better
+	      if (wtsum + dots[i].wt - wtok < wtok - wtsum)
+		dotmark[i] = 0;
+	      wtsum += dots[i].wt;
+	    }
+	  }
+	  if (breakflag) break;                   // done if moved enough
+	}
+
+	wtlo += med.wthi;
+	if (targetlo-wtlo <= tolerance) break;  // close enough
+
+	valuemin = med.valuehi;                   // iterate again
+	markactive = 1;
+      }
+
+      else if (wthi + med.totalhi < targethi) {  // upper half TOO SMALL
+
+	wthi += med.totalhi;
+	valuehalf = med.valuelo;
+
+	if (med.countlo == 1) {                  // only one dot to move
+	  if (wthi + med.wtlo < targethi) {  // move it, keep iterating
+	    if (me == med.proclo) dotmark[indexlo] = 1;
+	  }
+	  else {                                 // only move if beneficial
+	    if (wthi + med.wtlo - targethi < targethi - wthi)
+	      if (me == med.proclo) dotmark[indexlo] = 1;
+	    break;                               // all done
+	  }
+	}
+	else {                                   // multiple dots to move
+	  breakflag = 0;
+	  wtok = 0.0;
+	  if (medme.valuelo == med.valuelo) wtok = medme.wtlo;   
+	  if (wthi + med.wtlo >= targethi) {                // all done
+	    MPI_Scan(&wtok,&wtupto,1,MPI_DOUBLE,MPI_SUM,comm);
+	    wtmax = targethi - wthi;
+	    if (wtupto > wtmax) wtok = wtok - (wtupto - wtmax);
+	    breakflag = 1;
+	  }                                      // wtok = most I can move
+	  for (j = 0, wtsum = 0.0; j < nlist && wtsum < wtok; j++) {
+	    i = dotlist[j];
+	    if (dots[i].x[dim] == med.valuelo) { // only move if better
+	      if (wtsum + dots[i].wt - wtok < wtok - wtsum) 
+		dotmark[i] = 1;
+	      wtsum += dots[i].wt;
+	    }
+	  }
+	  if (breakflag) break;                   // done if moved enough
+	}
+
+	wthi += med.wtlo;
+	if (targethi-wthi <= tolerance) break;  // close enough
+
+	valuemax = med.valuelo;                   // iterate again
+	markactive = 0;
+      }
+
+      else                  // Goldilocks result: both partitions just right
+	break;
+
+      // shrink the active list
+      
+      k = 0;
+      for (j = 0; j < nlist; j++) {
+	i = dotlist[j];
+	if (dotmark[i] == markactive) dotlist[k++] = i;
+      }
+      nlist = k;
+    }
+
+    // found median
+    // store cut info only if I am procmid
+
+    if (me == procmid) {
+      cut = valuehalf;
+      cutdim = dim;
+    }
+
+    // use cut to shrink my RCB bounding box
+
+    if (me < procmid) hi[dim] = valuehalf;
+    else lo[dim] = valuehalf;
+
+    // outgoing = number of dots to ship to partner
+    // nkeep = number of dots that have never migrated
+
+    markactive = (me < procpartner);
+    for (i = 0, keep = 0, outgoing = 0; i < ndot; i++)
+      if (dotmark[i] == markactive) outgoing++;
+      else if (i < nkeep) keep++;
+    nkeep = keep;
+    
+    // alert partner how many dots I'll send, read how many I'll recv
+
+    MPI_Send(&outgoing,1,MPI_INT,procpartner,0,world);
+    incoming = 0;
+    if (readnumber) {
+      MPI_Recv(&incoming,1,MPI_INT,procpartner,0,world,&status);
+      if (readnumber == 2) {
+	MPI_Recv(&incoming2,1,MPI_INT,procpartner2,0,world,&status);
+	incoming += incoming2;
+      }
+    }
+
+    // check if need to alloc more space
+
+    int ndotnew = ndot - outgoing + incoming;
+    if (ndotnew > maxdot) {
+      while (maxdot < ndotnew) maxdot += DELTA;
+      dots = (Dot *) memory->srealloc(dots,maxdot*sizeof(Dot),"RCB::dots");
+      counters[6]++;
+    }
+
+    counters[1] += outgoing;
+    counters[2] += incoming;
+    if (ndotnew > counters[3]) counters[3] = ndotnew;
+    if (maxdot > counters[4]) counters[4] = maxdot;
+    
+    // malloc comm send buffer
+
+    if (outgoing > maxbuf) {
+      memory->sfree(buf);
+      maxbuf = outgoing;
+      buf = (Dot *) memory->smalloc(maxbuf*sizeof(Dot),"RCB:buf");
+    }
+
+    // fill buffer with dots that are marked for sending
+    // pack down the unmarked ones
+    
+    keep = outgoing = 0;
+    for (i = 0; i < ndot; i++) {
+      if (dotmark[i] == markactive)
+	memcpy(&buf[outgoing++],&dots[i],sizeof(Dot));
+      else
+	memcpy(&dots[keep++],&dots[i],sizeof(Dot));
+    }
+
+    // post receives for dots
+
+    if (readnumber > 0) {
+      MPI_Irecv(&dots[keep],incoming*sizeof(Dot),MPI_CHAR,
+                procpartner,1,world,&request);
+      if (readnumber == 2) {
+	keep += incoming - incoming2;
+	MPI_Irecv(&dots[keep],incoming2*sizeof(Dot),MPI_CHAR,
+                  procpartner2,1,world,&request2);
+      }
+    }
+    
+    // handshake before sending dots to insure recvs have been posted
+    
+    if (readnumber > 0) {
+      MPI_Send(NULL,0,MPI_INT,procpartner,0,world);
+      if (readnumber == 2) MPI_Send(NULL,0,MPI_INT,procpartner2,0,world);
+    }
+    MPI_Recv(NULL,0,MPI_INT,procpartner,0,world,&status);
+
+    // send dots to partner
+
+    MPI_Rsend(buf,outgoing*sizeof(Dot),MPI_CHAR,procpartner,1,world);
+    
+    // wait until all dots are received
+
+    if (readnumber > 0) {
+      MPI_Wait(&request,&status);
+      if (readnumber == 2) MPI_Wait(&request2,&status);
+    }
+
+    ndot = ndotnew;
+
+    // cut partition in half, create new communicators of 1/2 size
+
+    int split;
+    if (me < procmid) {
+      procupper = procmid - 1;
+      split = 0;
+    } else {
+      proclower = procmid;
+      split = 1;
+    }
+
+    MPI_Comm_split(comm,split,me,&comm_half);
+    MPI_Comm_free(&comm);
+    comm = comm_half;
+  }
+
+  // clean up
+
+  MPI_Comm_free(&comm);
+
+  // set public variables with results of rebalance
+
+  nfinal = ndot;
+
+  if (nfinal > maxrecv) {
+    memory->destroy(recvproc);
+    memory->destroy(recvindex);
+    maxrecv = nfinal;
+    memory->create(recvproc,maxrecv,"RCB:recvproc");
+    memory->create(recvindex,maxrecv,"RCB:recvindex");
+  }
+
+  for (i = 0; i < nfinal; i++) {
+    recvproc[i] = dots[i].proc;
+    recvindex[i] = dots[i].index;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   custom MPI reduce operation
+   merge of each component of an RCB bounding box
+------------------------------------------------------------------------- */
+
+void box_merge(void *in, void *inout, int *len, MPI_Datatype *dptr)
+
+{
+  RCB::BBox *box1 = (RCB::BBox *) in;
+  RCB::BBox *box2 = (RCB::BBox *) inout;
+
+  for (int i = 0; i < 3; i++) {
+    if (box1->lo[i] < box2->lo[i]) box2->lo[i] = box1->lo[i];
+    if (box1->hi[i] > box2->hi[i]) box2->hi[i] = box1->hi[i];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   custom MPI reduce operation
+   merge median data structure
+   on input:
+     in,inout->totallo, totalhi = weight in both partitions on this proc
+               valuelo, valuehi = pos of nearest dot(s) to cut on this proc
+               wtlo, wthi       = total wt of dot(s) at that pos on this proc
+               countlo, counthi = # of dot(s) nearest to cut on this proc
+               proclo, prochi   = not used
+   on exit:
+     inout->   totallo, totalhi = total # of active dots in both partitions
+               valuelo, valuehi = pos of nearest dot(s) to cut
+               wtlo, wthi       = total wt of dot(s) at that position
+               countlo, counthi = total # of dot(s) nearest to cut
+               proclo, prochi   = one unique proc who owns a nearest dot
+	                          all procs must get same proclo,prochi
+------------------------------------------------------------------------- */
+
+void median_merge(void *in, void *inout, int *len, MPI_Datatype *dptr)
+
+{
+  RCB::Median *med1 = (RCB::Median *) in;
+  RCB::Median *med2 = (RCB::Median *) inout;
+  
+  med2->totallo += med1->totallo;
+  if (med1->valuelo > med2->valuelo) {
+    med2->valuelo = med1->valuelo;
+    med2->wtlo = med1->wtlo;
+    med2->countlo = med1->countlo;
+    med2->proclo = med1->proclo;
+  }
+  else if (med1->valuelo == med2->valuelo) {
+    med2->wtlo += med1->wtlo;
+    med2->countlo += med1->countlo;
+    if (med1->proclo < med2->proclo) med2->proclo = med1->proclo;
+  }
+
+  med2->totalhi += med1->totalhi;
+  if (med1->valuehi < med2->valuehi) {
+    med2->valuehi = med1->valuehi;
+    med2->wthi = med1->wthi;
+    med2->counthi = med1->counthi;
+    med2->prochi = med1->prochi;
+  }
+  else if (med1->valuehi == med2->valuehi) {
+    med2->wthi += med1->wthi;
+    med2->counthi += med1->counthi;
+    if (med1->prochi < med2->prochi) med2->prochi = med1->prochi;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   invert the RCB rebalance result to convert receive info into send info
+   sortflag = flag for sorting order of received messages by proc ID
+------------------------------------------------------------------------- */
+
+void RCB::invert(int sortflag)
+{
+  Invert *sbuf,*rbuf;
+
+  // only create Irregular if not previously created
+  // allows Irregular to persist for multiple RCB calls by fix balance
+
+  if (!irregular) irregular = new Irregular(lmp);
+
+  // nsend = # of dots to request from other procs
+
+  int nsend = nfinal-nkeep;
+
+  int *proclist;
+  memory->create(proclist,nsend,"RCB:proclist");
+
+  Invert *sinvert = 
+    (Invert *) memory->smalloc(nsend*sizeof(Invert),"RCB:sinvert");
+
+  int m = 0;
+  for (int i = nkeep; i < nfinal; i++) {
+    proclist[m] = recvproc[i];
+    sinvert[m].rindex = recvindex[i];
+    sinvert[m].sproc = me;
+    sinvert[m].sindex = i;
+    m++;
+  }
+  
+  // perform inversion via irregular comm
+  // nrecv = # of my dots to send to other procs
+
+  int nrecv = irregular->create_data(nsend,proclist,sortflag);
+  Invert *rinvert = 
+    (Invert *) memory->smalloc(nrecv*sizeof(Invert),"RCB:rinvert");
+  irregular->exchange_data((char *) sinvert,sizeof(Invert),(char *) rinvert);
+  irregular->destroy_data();
+
+  // set public variables from requests to send my dots
+
+  if (noriginal > maxsend) {
+    memory->destroy(sendproc);
+    memory->destroy(sendindex);
+    maxsend = noriginal;
+    memory->create(sendproc,maxsend,"RCB:sendproc");
+    memory->create(sendindex,maxsend,"RCB:sendindex");
+  }
+
+  for (int i = 0; i < nkeep; i++) {
+    sendproc[recvindex[i]] = me;
+    sendindex[recvindex[i]] = i;
+  }
+
+  for (int i = 0; i < nrecv; i++) {
+    m = rinvert[i].rindex;
+    sendproc[m] = rinvert[i].sproc;
+    sendindex[m] = rinvert[i].sindex;
+  }
+
+  // clean-up
+
+  memory->destroy(proclist);
+  memory->destroy(sinvert);
+  memory->destroy(rinvert);
+}
+
+/* ----------------------------------------------------------------------
+   memory use of Irregular
+------------------------------------------------------------------------- */
+
+bigint RCB::memory_usage()
+{
+  bigint bytes = 0;
+  if (irregular) bytes += irregular->memory_usage();
+  return bytes;
+}
+
+// -----------------------------------------------------------------------
+// DEBUG methods
+// -----------------------------------------------------------------------
+/*
+// consistency checks on RCB results
+
+void RCB::check()
+{
+  int i,iflag,total1,total2;
+  double weight,wtmax,wtmin,wtone,tolerance;
+
+  // check that total # of dots remained the same
+
+  MPI_Allreduce(&ndotorig,&total1,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&ndot,&total2,1,MPI_INT,MPI_SUM,world);
+  if (total1 != total2) {
+    if (me == 0) 
+      printf("ERROR: Points before RCB = %d, Points after RCB = %d\n",
+	     total1,total2);
+  }
+  
+  // check that result is load-balanced within log2(P)*max-wt
+
+  weight = wtone = 0.0;
+  for (i = 0; i < ndot; i++) {
+    weight += dots[i].wt;
+    if (dots[i].wt > wtone) wtone = dots[i].wt;
+  }
+
+  MPI_Allreduce(&weight,&wtmin,1,MPI_DOUBLE,MPI_MIN,world);
+  MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
+  MPI_Allreduce(&wtone,&tolerance,1,MPI_DOUBLE,MPI_MAX,world);
+
+  // i = smallest power-of-2 >= nprocs
+  // tolerance = largest-single-weight*log2(nprocs)
+
+  for (i = 0; (nprocs >> i) != 0; i++);
+  tolerance = tolerance * i * (1.0 + TINY);
+
+  if (wtmax - wtmin > tolerance) {
+    if (me == 0) 
+      printf("ERROR: Load-imbalance > tolerance of %g\n",tolerance);
+    MPI_Barrier(world);
+    if (weight == wtmin) printf("  Proc %d has weight = %g\n",me,weight);
+    if (weight == wtmax) printf("  Proc %d has weight = %g\n",me,weight);
+  }
+  
+  MPI_Barrier(world);
+  
+  // check that final set of points is inside RCB box of each proc
+  
+  iflag = 0;
+  for (i = 0; i < ndot; i++) {
+    if (dots[i].x[0] < lo[0] || dots[i].x[0] > hi[0] ||
+	dots[i].x[1] < lo[1] || dots[i].x[1] > hi[1] ||
+	dots[i].x[2] < lo[2] || dots[i].x[2] > hi[2])
+      iflag++;
+  }
+  if (iflag > 0) 
+    printf("ERROR: %d points are out-of-box on proc %d\n",iflag,me);
+}
+
+// stats for RCB decomposition
+
+void RCB::stats(int flag)
+{
+  int i,iflag,sum,min,max;
+  double ave,rsum,rmin,rmax;
+  double weight,wttot,wtmin,wtmax;
+
+  if (me == 0) printf("RCB Statistics:\n");
+
+  // distribution info
+  
+  for (i = 0, weight = 0.0; i < ndot; i++) weight += dots[i].wt;
+  MPI_Allreduce(&weight,&wttot,1,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(&weight,&wtmin,1,MPI_DOUBLE,MPI_MIN,world);
+  MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
+
+  if (me == 0) {
+    printf(" Total weight of dots = %g\n",wttot);
+    printf(" Weight on each proc: ave = %g, max = %g, min = %g\n",
+	   wttot/nprocs,wtmax,wtmin);
+  }
+  if (flag) {
+    MPI_Barrier(world);
+    printf("    Proc %d has weight = %g\n",me,weight);
+  }
+
+  for (i = 0, weight = 0.0; i < ndot; i++) 
+    if (dots[i].wt > weight) weight = dots[i].wt;
+  MPI_Allreduce(&weight,&wtmax,1,MPI_DOUBLE,MPI_MAX,world);
+  
+  if (me == 0) printf(" Maximum weight of single dot = %g\n",wtmax);
+  if (flag) {
+    MPI_Barrier(world);
+    printf("    Proc %d max weight = %g\n",me,weight);
+  }
+
+  // counter info
+
+  MPI_Allreduce(&counters[0],&sum,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&counters[0],&min,1,MPI_INT,MPI_MIN,world);
+  MPI_Allreduce(&counters[0],&max,1,MPI_INT,MPI_MAX,world);
+  ave = ((double) sum)/nprocs;
+  if (me == 0) 
+    printf(" Median iter: ave = %g, min = %d, max = %d\n",ave,min,max);
+  if (flag) {
+    MPI_Barrier(world);
+    printf("    Proc %d median count = %d\n",me,counters[0]);
+  }
+
+  MPI_Allreduce(&counters[1],&sum,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&counters[1],&min,1,MPI_INT,MPI_MIN,world);
+  MPI_Allreduce(&counters[1],&max,1,MPI_INT,MPI_MAX,world);
+  ave = ((double) sum)/nprocs;
+  if (me == 0) 
+    printf(" Send count: ave = %g, min = %d, max = %d\n",ave,min,max);
+  if (flag) {
+    MPI_Barrier(world);
+    printf("    Proc %d send count = %d\n",me,counters[1]);
+  }
+
+  MPI_Allreduce(&counters[2],&sum,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&counters[2],&min,1,MPI_INT,MPI_MIN,world);
+  MPI_Allreduce(&counters[2],&max,1,MPI_INT,MPI_MAX,world);
+  ave = ((double) sum)/nprocs;
+  if (me == 0) 
+    printf(" Recv count: ave = %g, min = %d, max = %d\n",ave,min,max);
+  if (flag) {
+    MPI_Barrier(world);
+    printf("    Proc %d recv count = %d\n",me,counters[2]);
+  }
+
+  MPI_Allreduce(&counters[3],&sum,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&counters[3],&min,1,MPI_INT,MPI_MIN,world);
+  MPI_Allreduce(&counters[3],&max,1,MPI_INT,MPI_MAX,world);
+  ave = ((double) sum)/nprocs;
+  if (me == 0) 
+    printf(" Max dots: ave = %g, min = %d, max = %d\n",ave,min,max);
+  if (flag) {
+    MPI_Barrier(world);
+    printf("    Proc %d max dots = %d\n",me,counters[3]);
+  }
+
+  MPI_Allreduce(&counters[4],&sum,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&counters[4],&min,1,MPI_INT,MPI_MIN,world);
+  MPI_Allreduce(&counters[4],&max,1,MPI_INT,MPI_MAX,world);
+  ave = ((double) sum)/nprocs;
+  if (me == 0) 
+    printf(" Max memory: ave = %g, min = %d, max = %d\n",ave,min,max);
+  if (flag) {
+    MPI_Barrier(world);
+    printf("    Proc %d max memory = %d\n",me,counters[4]);
+  }
+
+  if (reuse) {
+    MPI_Allreduce(&counters[5],&sum,1,MPI_INT,MPI_SUM,world);
+    MPI_Allreduce(&counters[5],&min,1,MPI_INT,MPI_MIN,world);
+    MPI_Allreduce(&counters[5],&max,1,MPI_INT,MPI_MAX,world);
+    ave = ((double) sum)/nprocs;
+    if (me == 0) 
+      printf(" # of Reuse: ave = %g, min = %d, max = %d\n",ave,min,max);
+    if (flag) {
+      MPI_Barrier(world);
+      printf("    Proc %d # of Reuse = %d\n",me,counters[5]);
+    }
+  }
+  
+  MPI_Allreduce(&counters[6],&sum,1,MPI_INT,MPI_SUM,world);
+  MPI_Allreduce(&counters[6],&min,1,MPI_INT,MPI_MIN,world);
+  MPI_Allreduce(&counters[6],&max,1,MPI_INT,MPI_MAX,world);
+  ave = ((double) sum)/nprocs;
+  if (me == 0) 
+    printf(" # of OverAlloc: ave = %g, min = %d, max = %d\n",ave,min,max);
+  if (flag) {
+    MPI_Barrier(world);
+    printf("    Proc %d # of OverAlloc = %d\n",me,counters[6]);
+  }
+
+  // RCB boxes for each proc
+  
+  if (flag) {
+    if (me == 0) printf(" RCB sub-domain boxes:\n");
+    for (i = 0; i < 3; i++) {
+      MPI_Barrier(world);
+      if (me == 0) printf("    Dimension %d\n",i+1);
+      MPI_Barrier(world);
+      printf("      Proc = %d: Box = %g %g\n",me,lo[i],hi[i]);
+    }
+  }
+}
+*/
diff --git a/src/rcb.h b/src/rcb.h
new file mode 100644
index 0000000000..4c2e3809e9
--- /dev/null
+++ b/src/rcb.h
@@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LAMMPS_RCB_H
+#define LAMMPS_RCB_H
+
+#include "mpi.h"
+#include "pointers.h"
+
+namespace LAMMPS_NS {
+
+class RCB : protected Pointers {
+ public:
+  // set by compute()
+
+  int noriginal;              // # of dots I own before balancing
+  int nfinal;                 // # of dots I own after balancing
+  int nkeep;                  // how many dots of noriginal I still own
+                              // will be first nkept of nfinal list
+  int *recvproc;              // proc IDs of nfinal dots
+  int *recvindex;             // index of nfinal dots on owning procs
+                              // based on input list for compute()
+  double *lo,*hi;             // final bounding box of my RCB sub-domain
+  double cut;                 // single cut (in Tree) owned by this proc
+  int cutdim;                 // dimension (0,1,2) of the cut
+
+  // set by invert()
+
+  int *sendproc;              // proc to send each of my noriginal dots to
+  int *sendindex;             // index of dot in receiver's nfinal list
+
+  RCB(class LAMMPS *);
+  ~RCB();
+  void compute(int, int, double **, double *, double *, double *);
+  void invert(int sortflag = 0);
+  bigint memory_usage();
+
+  // DEBUG methods
+  //void check();
+  //void stats(int);
+
+  // RCB cut info
+  
+  struct Median {
+    double totallo,totalhi;   // weight in each half of active partition
+    double valuelo,valuehi;   // position of dot(s) nearest to cut
+    double wtlo,wthi;         // total weight of dot(s) at that position
+    int countlo,counthi;      // # of dots at that position
+    int proclo,prochi;	      // unique proc who owns a nearest dot
+  };
+
+  struct BBox {
+    double lo[3],hi[3];       // corner points of a bounding box
+  };
+  
+ private:
+  int me,nprocs;
+
+  // point to balance on
+
+  struct Dot {
+    double x[3];          // coord of point
+    double wt;            // weight of point
+    int proc;             // owning proc
+    int index;            // index on owning proc
+  };
+
+  // tree of RCB cuts
+
+  struct Tree {
+    double cut;        	// position of cut
+    int dim;	        // dimension = 0/1/2 of cut
+  };
+  
+  // inversion message
+
+  struct Invert {
+    int rindex;         // index on receiving proc
+    int sproc;          // sending proc
+    int sindex;         // index on sending proc
+  };
+
+  Dot *dots;        // dots on this proc
+  int ndot;         // # of dots on this proc
+  int maxdot;       // allocated size of dots
+  int ndotorig;
+
+  int nlist;
+  int maxlist;
+  int *dotlist;
+  int *dotmark;
+
+  int maxbuf;
+  Dot *buf;
+
+  int maxrecv,maxsend;
+
+  BBox bbox;
+  class Irregular *irregular;
+
+  MPI_Op box_op,med_op;
+  MPI_Datatype box_type,med_type;
+
+  int reuse;        // 1/0 to use/not use previous cuts
+  int dottop;       // dots >= this index are new
+  double bboxlo[3]; // bounding box of final RCB sub-domain
+  double bboxhi[3]; 
+  Tree *tree;       // tree of RCB cuts, used by reuse()
+  int counters[7];  // diagnostic counts
+		    // 0 = # of median iterations
+		    // 1 = # of points sent
+		    // 2 = # of points received
+		    // 3 = most points this proc ever owns
+		    // 4 = most point memory this proc ever allocs
+		    // 5 = # of times a previous cut is re-used
+		    // 6 = # of reallocs of point vector
+};
+
+}
+
+#endif
diff --git a/src/replicate.cpp b/src/replicate.cpp
index b759c47fa3..26f3fca7ed 100644
--- a/src/replicate.cpp
+++ b/src/replicate.cpp
@@ -29,6 +29,8 @@ using namespace LAMMPS_NS;
 #define LB_FACTOR 1.1
 #define EPSILON   1.0e-6
 
+enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
+
 /* ---------------------------------------------------------------------- */
 
 Replicate::Replicate(LAMMPS *lmp) : Pointers(lmp) {}
@@ -220,17 +222,33 @@ void Replicate::command(int narg, char **arg)
     sublo[2] = domain->sublo_lamda[2]; subhi[2] = domain->subhi_lamda[2];
   }
 
-  if (domain->xperiodic) {
-    if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
-    if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
-  }
-  if (domain->yperiodic) {
-    if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
-    if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
-  }
-  if (domain->zperiodic) {
-    if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
-    if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
+  if (comm->layout != LAYOUT_TILED) {
+    if (domain->xperiodic) {
+      if (comm->myloc[0] == 0) sublo[0] -= epsilon[0];
+      if (comm->myloc[0] == comm->procgrid[0]-1) subhi[0] += epsilon[0];
+    }
+    if (domain->yperiodic) {
+      if (comm->myloc[1] == 0) sublo[1] -= epsilon[1];
+      if (comm->myloc[1] == comm->procgrid[1]-1) subhi[1] += epsilon[1];
+    }
+    if (domain->zperiodic) {
+      if (comm->myloc[2] == 0) sublo[2] -= epsilon[2];
+      if (comm->myloc[2] == comm->procgrid[2]-1) subhi[2] += epsilon[2];
+    }
+
+  } else {
+    if (domain->xperiodic) {
+      if (comm->mysplit[0][0] == 0.0) sublo[0] -= epsilon[0];
+      if (comm->mysplit[0][1] == 1.0) subhi[0] += epsilon[0];
+    }
+    if (domain->yperiodic) {
+      if (comm->mysplit[1][0] == 0.0) sublo[1] -= epsilon[1];
+      if (comm->mysplit[1][1] == 1.0) subhi[1] += epsilon[1];
+    }
+    if (domain->zperiodic) {
+      if (comm->mysplit[2][0] == 0.0) sublo[2] -= epsilon[2];
+      if (comm->mysplit[2][1] == 1.0) subhi[2] += epsilon[2];
+    }
   }
 
   // loop over all procs
diff --git a/src/variable.cpp b/src/variable.cpp
index 6efe9ef3f9..f371512128 100644
--- a/src/variable.cpp
+++ b/src/variable.cpp
@@ -44,6 +44,7 @@ using namespace MathConst;
 #define MAXLEVEL 4
 #define MAXLINE 256
 #define CHUNK 1024
+#define VALUELENGTH 64
 
 #define MYROUND(a) (( a-floor(a) ) >= .5) ? ceil(a) : floor(a)
 
@@ -306,7 +307,7 @@ void Variable::set(int narg, char **arg)
     pad[nvar] = 0;
     data[nvar] = new char*[num[nvar]];
     copy(1,&arg[2],data[nvar]);
-    data[nvar][1] = NULL;
+    data[nvar][1] = new char[VALUELENGTH];
 
   // SCALARFILE for strings or numbers
   // which = 1st value
@@ -360,7 +361,7 @@ void Variable::set(int narg, char **arg)
     pad[nvar] = 0;
     data[nvar] = new char*[num[nvar]];
     copy(2,&arg[2],data[nvar]);
-    data[nvar][2] = NULL;
+    data[nvar][2] = new char[VALUELENGTH];
 
   // EQUAL
   // replace pre-existing var if also style EQUAL (allows it to be reset)
@@ -374,9 +375,7 @@ void Variable::set(int narg, char **arg)
       if (style[ivar] != EQUAL)
         error->all(FLERR,"Cannot redefine variable as a different style");
       delete [] data[ivar][0];
-      if (data[ivar][1]) delete [] data[ivar][1];
       copy(1,&arg[2],data[ivar]);
-      data[ivar][1] = NULL;
       replaceflag = 1;
     } else {
       if (nvar == maxvar) grow();
@@ -386,7 +385,7 @@ void Variable::set(int narg, char **arg)
       pad[nvar] = 0;
       data[nvar] = new char*[num[nvar]];
       copy(1,&arg[2],data[nvar]);
-      data[nvar][1] = NULL;
+      data[nvar][1] = new char[VALUELENGTH];
     }
 
   // ATOM
@@ -625,32 +624,24 @@ char *Variable::retrieve(char *name)
     strcpy(data[ivar][0],result);
     str = data[ivar][0];
   } else if (style[ivar] == EQUAL) {
-    char result[64];
     double answer = evaluate(data[ivar][0],NULL);
-    sprintf(result,"%.15g",answer);
-    int n = strlen(result) + 1;
-    if (data[ivar][1]) delete [] data[ivar][1];
-    data[ivar][1] = new char[n];
-    strcpy(data[ivar][1],result);
+    sprintf(data[ivar][1],"%.15g",answer);
     str = data[ivar][1];
   } else if (style[ivar] == FORMAT) {
-    char result[64];
     int jvar = find(data[ivar][0]);
     if (jvar == -1) return NULL;
     if (!equalstyle(jvar)) return NULL;
     double answer = evaluate(data[jvar][0],NULL);
-    sprintf(result,data[ivar][1],answer);
-    int n = strlen(result) + 1;
-    if (data[ivar][2]) delete [] data[ivar][2];
-    data[ivar][2] = new char[n];
-    strcpy(data[ivar][2],result);
+    sprintf(data[ivar][2],data[ivar][1],answer);
     str = data[ivar][2];
   } else if (style[ivar] == GETENV) {
     const char *result = getenv(data[ivar][0]);
-    if (data[ivar][1]) delete [] data[ivar][1];
-    if (result == NULL) result = (const char *)"";
+    if (result == NULL) result = (const char *) "";
     int n = strlen(result) + 1;
-    data[ivar][1] = new char[n];
+    if (n > VALUELENGTH) {
+      delete [] data[ivar][1];
+      data[ivar][1] = new char[n];
+    }
     strcpy(data[ivar][1],result);
     str = data[ivar][1];
   } else if (style[ivar] == ATOM || style[ivar] == ATOMFILE) return NULL;
@@ -782,6 +773,45 @@ int Variable::atomstyle(int ivar)
   return 0;
 }
 
+/* ----------------------------------------------------------------------
+   save copy of EQUAL style ivar formula in copy
+   allocate copy here, later equal_restore() call will free it
+   insure data[ivar][0] is of VALUELENGTH since will be overridden
+------------------------------------------------------------------------- */
+
+void Variable::equal_save(int ivar, char *&copy)
+{
+  int n = strlen(data[ivar][0]) + 1;
+  copy = new char[n];
+  strcpy(copy,data[ivar][0]);
+  delete [] data[ivar][0];
+  data[ivar][0] = new char[VALUELENGTH];
+}
+
+/* ----------------------------------------------------------------------
+   restore formula string of EQUAL style ivar from copy
+   then free copy, allocated in equal_save()
+------------------------------------------------------------------------- */
+
+void Variable::equal_restore(int ivar, char *copy)
+{
+  delete [] data[ivar][0];
+  int n = strlen(copy) + 1;
+  data[ivar][0] = new char[n];
+  strcpy(data[ivar][0],copy);
+  delete [] copy;
+}
+
+/* ----------------------------------------------------------------------
+   override EQUAL style ivar formula with value converted to string
+   data[ivar][0] was set to length 64 in equal_save()
+------------------------------------------------------------------------- */
+
+void Variable::equal_override(int ivar, double value)
+{
+  sprintf(data[ivar][0],"%.15g",value);
+}
+
 /* ----------------------------------------------------------------------
    remove Nth variable from list and compact list
    delete reader explicitly if it exists
diff --git a/src/variable.h b/src/variable.h
index bf6dcea656..f04a04f9d4 100644
--- a/src/variable.h
+++ b/src/variable.h
@@ -36,10 +36,15 @@ class Variable : protected Pointers {
   int int_between_brackets(char *&, int);
   double evaluate_boolean(char *);
 
+  void equal_save(int, char *&);
+  void equal_restore(int, char *);
+  void equal_override(int, double);
+
   unsigned int data_mask(int ivar);
   unsigned int data_mask(char *str);
 
  private:
+  int me;
   int nvar;                // # of defined variables
   int maxvar;              // max # of variables following lists can hold
   char **names;            // name of each variable
@@ -57,7 +62,6 @@ class Variable : protected Pointers {
 
   int precedence[17];      // precedence level of math operators
                            // set length to include up to OR in enum
-  int me;
 
   struct Tree {            // parse tree for atom-style variables
     double value;          // single scalar  
diff --git a/src/version.h b/src/version.h
index 25d90d9612..b4d684523a 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1 +1 @@
-#define LAMMPS_VERSION "22 Jul 2014"
+#define LAMMPS_VERSION "29 Jul 2014"