From c46be7db625efcdb7943eb14c6e577488d349518 Mon Sep 17 00:00:00 2001
From: Steve Plimpton <sjplimp@sandia.gov>
Date: Wed, 5 Oct 2016 10:33:39 -0600
Subject: [PATCH] changes to imbalance weight factors

---
 doc/src/Manual.txt      |   4 +-
 doc/src/balance.txt     | 108 ++++++++++++++++++++++++----------------
 doc/src/dump_modify.txt |  57 ++++++++++++++++++---
 examples/body/in.body   |  10 +++-
 src/angle.cpp           |   3 --
 src/angle.h             |   6 +--
 src/atom.cpp            |   3 --
 src/atom.h              |   5 --
 src/atom_vec.h          |   4 --
 src/bond.cpp            |   3 --
 src/bond.h              |   6 +--
 src/comm_tiled.h        |   4 --
 src/compute.cpp         |   3 --
 src/compute.h           |   6 ---
 src/dihedral.cpp        |   3 --
 src/dihedral.h          |   6 +--
 src/dump_custom.cpp     |   8 ++-
 src/fix.cpp             |   5 +-
 src/fix.h               |   8 ---
 src/imbalance_group.cpp |  12 ++---
 src/imbalance_neigh.cpp |  54 +++++++++++---------
 src/imbalance_time.cpp  |  85 +++++++++++++++++++------------
 src/imbalance_var.cpp   |  16 +++---
 src/improper.cpp        |   3 --
 src/improper.h          |   6 +--
 src/input.h             |   6 ---
 src/kspace.cpp          |   3 --
 src/kspace.h            |   4 +-
 src/lammps.h            |   9 ----
 src/pair.cpp            |   3 --
 src/pair.h              |   6 ---
 src/suffix.h            |   5 +-
 src/update.h            |   8 ---
 src/variable.cpp        |  66 ------------------------
 src/variable.h          |   3 --
 src/version.h           |   2 +-
 36 files changed, 241 insertions(+), 302 deletions(-)
diff --git a/doc/src/Manual.txt b/doc/src/Manual.txt
index d51508236a..bb0bd25670 100644
--- a/doc/src/Manual.txt
+++ b/doc/src/Manual.txt
@@ -1,7 +1,7 @@
 <!-- HTML_ONLY -->
 <HEAD>
 <TITLE>LAMMPS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="30 Sep 2016 version">
+<META NAME="docnumber" CONTENT="5 Oct 2016 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@@ -21,7 +21,7 @@
 <H1></H1>
 
 LAMMPS Documentation :c,h3
-30 Sep 2016 version :c,h4
+5 Oct 2016 version :c,h4
 
 Version info: :h4
 
diff --git a/doc/src/balance.txt b/doc/src/balance.txt
index 635e7a9b3d..194fce92c9 100644
--- a/doc/src/balance.txt
+++ b/doc/src/balance.txt
@@ -319,24 +319,25 @@ accurately would be impractical and slow down the computation.
 Instead the {weight} keyword implements several ways to influence the
 per-particle weights empirically by properties readily available or
 using the user's knowledge of the system.  Note that the absolute
-value of the weights are not important; their ratio is what is used to
-assign particles to processors.  A particle with a weight of 2.5 is
-assumed to require 5x more computational than a particle with a weight
-of 0.5.
+value of the weights is not important; only their relative ratios
+affect which particle is assigned to which processor.  A particle with
+a weight of 2.5 is assumed to require 5x more computational effort than
+a particle with a weight of 0.5.  For all the options below the weight
+assigned to a particle must be a positive value; an error will be
+generated if a weight is <= 0.0.
 
 Below is a list of possible weight options with a short description of
 their usage and some example scenarios where they might be applicable.
-It is possible to apply multiple weight flags and the weightins they
+It is possible to apply multiple weight flags and the weightings they
 induce will be combined through multiplication.  Most of the time,
 however, it is sufficient to use just one method.
 
 The {group} weight style assigns weight factors to specified
 "groups"_group.html of particles.  The {group} style keyword is
 followed by the number of groups, then pairs of group IDs and the
-corresponding weight factor. If a particle belongs to none of the
+corresponding weight factor.  If a particle belongs to none of the
 specified groups, its weight is not changed.  If it belongs to
 multiple groups, its weight is the product of the weight factors.
-The weight factors have to be positive.
 
 This weight style is useful in combination with pair style
 "hybrid"_pair_hybrid.html, e.g. when combining a more costly manybody
@@ -347,14 +348,24 @@ the computational cost for each group remains constant over time.
 This is a purely empirical weighting, so a series test runs to tune
 the assigned weight factors for optimal performance is recommended.
 
-The {neigh} weight style assigns a weight to each particle equal to
-its number of neighbors divided by the avergage number of neighbors
-for all particles.  The {factor} setting is then appied as an overall
-scale factor to all the {neigh} weights which allows tuning of the
-impact of this style.  A {factor} smaller than 1.0 (e.g. 0.8) often
-results in the best performance, since the number of neighbors is
-likely to overestimate the ideal weight. The factor has to be between
-0.0 and 2.0.
+The {neigh} weight style assigns the same weight to each particle
+owned by a processor based on the total count of neighbors in the
+neighbor list owned by that processor.  The motivation is that more
+neighbors means a higher computational cost.  The style does not use
+neighbors per atom to assign a unique weight to each atom, because
+that value can vary depending on how the neighbor list is built.
+
+The {factor} setting is applied as an overall scale factor to the
+{neigh} weights which allows adjustment of their impact on the
+balancing operation.  The specified {factor} value must be positive.
+A value > 1.0 will increase the weights so that the ratio of max
+weight to min weight increases by {factor}.  A value < 1.0 will
+decrease the weights so that the ratio of max weight to min weight
+decreases by {factor}.  In both cases the intermediate weight values
+increase/decrease proportionally as well.  A value = 1.0 has no effect
+on the {neigh} weights.  As a rule of thumb, we have found a {factor}
+of about 0.8 often results in the best performance, since the number
+of neighbors is likely to overestimate the ideal weight.
 
 This weight style is useful for systems where there are different
 cutoffs used for different pairs of interations, or the density
@@ -370,35 +381,48 @@ weights are computed.  Inserting a "run 0 post no"_run.html command
 before issuing the {balance} command, may be a workaround for this
 case, as it will induce the neighbor list to be built.
 
-The {time} weight style uses "timer data"_timer.html to estimate a
-weight for each particle.  It uses the same information as is used for
-the "MPI task timing breakdown"_Section_start.html#start_8, namely,
-the timings for sections {Pair}, {Bond}, {Kspace}, and {Neigh}.  The
-time spent in these sections of the timestep are measured for each MPI
-rank, summed up, then converted into a cost for each MPI rank relative
-to the average cost over all MPI ranks for the same sections.  That
-cost then evenly distributed over all the particles owned by that
-rank.  Finally, the {factor} setting is then appied as an overall
-scale factor to all the {time} weights as a way to fine tune the
-impact of this weight style.  Good {factor} values to use are
-typically between 0.5 and 1.2. Allowed are values between 0.0 and 2.0.
+The {time} weight style uses "timer data"_timer.html to estimate
+weights.  It assigns the same weight to each particle owned by a
+processor based on the total computational time spent by that
+processor.  See details below on what time window is used.  It uses
+the same timing information as is used for the "MPI task timing
+breakdown"_Section_start.html#start_8, namely, for sections {Pair},
+{Bond}, {Kspace}, and {Neigh}.  The time spent in those portions of
+the timestep is measured for each MPI rank, summed, then divided by
+the number of particles owned by that processor.  I.e. the weight is
+an effective CPU time/particle averaged over the particles on that
+processor.
 
-For the {balance} command the timing data is taken from the preceding
-run command, i.e. the timings are for the entire previous run.  For
-the {fix balance} command the timing data is for only the timesteps
-since the last balancing operation was performed.  If timing
-information for the required sections is not available, e.g. at the
-beginning of a run, or when the "timer"_timer.html command is set to
-either {loop} or {off}, a warning is issued.  In this case no weights
-are computed.
+The {factor} setting is applied as an overall scale factor to the
+{time} weights which allows adjustment of their impact on the
+balancing operation.  The specified {factor} value must be positive.
+A value > 1.0 will increase the weights so that the ratio of max
+weight to min weight increases by {factor}.  A value < 1.0 will
+decrease the weights so that the ratio of max weight to min weight
+decreases by {factor}.  In both cases the intermediate weight values
+increase/decrease proportionally as well.  A value = 1.0 has no effect
+on the {time} weights.  As a rule of thumb, effective values to use
+are typically between 0.5 and 1.2.  Note that the timer quantities
+mentioned above can be affected by communication which occurs in the
+middle of the operations, e.g. pair styles with intermediate exchange
+of data within the force computation, and likewise for KSpace solves.
 
-This weight style is the most generic one, and should be tried first,
-if neither the {group} or {neigh} styles are easily applicable.
-However, since the computed cost function is averaged over all local
-particles this weight style may not be highly accurate.  This style
-can also be effective as a secondary weight in combination with either
-{group} or {neigh} to offset some of inaccuracies in either of those
-heuristics.
+When using the {time} weight style with the {balance} command, the
+timing data is taken from the preceding run command, i.e. the timings
+are for the entire previous run.  For the {fix balance} command the
+timing data is for only the timesteps since the last balancing
+operation was performed.  If timing information for the required
+sections is not available, e.g. at the beginning of a run, or when the
+"timer"_timer.html command is set to either {loop} or {off}, a warning
+is issued.  In this case no weights are computed.
+
+NOTE: The {time} weight style is the most generic option, and should
+be tried first, unless the {group} style is easily applicable.
+However, since the computed cost function is averaged over all
+particles on a processor, the weights may not be highly accurate.
+This style can also be effective as a secondary weight in combination
+with either {group} or {neigh} to offset some of the inaccuracies in
+either of those heuristics.
 
 The {var} weight style assigns per-particle weights by evaluating an
 "atom-style variable"_variable.html specified by {name}.  This is
diff --git a/doc/src/dump_modify.txt b/doc/src/dump_modify.txt
index 00734a5975..c236aa284d 100644
--- a/doc/src/dump_modify.txt
+++ b/doc/src/dump_modify.txt
@@ -49,8 +49,8 @@ keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} o
      -N = sort per-atom lines in descending order by the Nth column
   {thresh} args = attribute operation value
     attribute = same attributes (x,fy,etotal,sxx,etc) used by dump custom style
-    operation = "<" or "<=" or ">" or ">=" or "==" or "!="
-    value = numeric value to compare to
+    operation = "<" or "<=" or ">" or ">=" or "==" or "!=" or "|^"
+    value = numeric value to compare to, or LAST
     these 3 args can be replaced by the word "none" to turn off thresholding
   {unwrap} arg = {yes} or {no} :pre
 these keywords apply only to the {image} and {movie} "styles"_dump_image.html :l
@@ -458,16 +458,59 @@ as well as memory, versus unsorted output.
 
 The {thresh} keyword only applies to the dump {custom}, {cfg},
 {image}, and {movie} styles.  Multiple thresholds can be specified.
-Specifying "none" turns off all threshold criteria.  If thresholds are
+Specifying {none} turns off all threshold criteria.  If thresholds are
 specified, only atoms whose attributes meet all the threshold criteria
 are written to the dump file or included in the image.  The possible
 attributes that can be tested for are the same as those that can be
 specified in the "dump custom"_dump.html command, with the exception
 of the {element} attribute, since it is not a numeric value.  Note
-that different attributes can be output by the dump custom command
-than are used as threshold criteria by the dump_modify command.
-E.g. you can output the coordinates and stress of atoms whose energy
-is above some threshold.
+that different attributes can be used than those output by the "dump
+custom"_dump.html command.  E.g. you can output the coordinates and
+stress of atoms whose energy is above some threshold.
+
+If an atom-style variable is used as the attribute, then it can
+produce continuous numeric values or effective Boolean 0/1 values
+which may be useful for the comparison operation.  Boolean values can
+be generated by variable formulas that use comparison or Boolean math
+operators or special functions like gmask() and rmask() and grmask().
+See the "variable"_variable.html command doc page for details.
+
+NOTE: The LAST option, discussed below, is not yet implemented.  It
+will be soon.
+
+The specified value must be a simple numeric value or the word LAST.
+If LAST is used, it refers to the value of the attribute the last time
+the dump command was invoked to produce a snapshot.  This is a way to
+only dump atoms whose attribute has changed (or not changed).
+Three examples follow.
+
+dump_modify ... thresh ix != LAST :pre
+
+This will dump atoms which have crossed the periodic x boundary of the
+simulation box since the last dump.  (Note that atoms that crossed
+once and then crossed back between the two dump timesteps would not be
+included.)
+
+region foo sphere 10 20 10 15
+variable inregion atom rmask(foo)
+dump_modify ... thresh v_inregion |^ LAST :pre
+
+This will dump atoms which crossed the boundary of the spherical
+region since the last dump.
+
+variable charge atom "(q > 0.5) || (q < -0.5)"
+dump_modify ... thresh v_charge |^ LAST :pre
+
+This will dump atoms whose charge has changed from an absolute value
+less than 1/2 to greater than 1/2 (or vice versa) since the last dump.
+E.g. due to reactions and subsequent charge equilibration in a
+reactive force field.
+
+The available operations are the usual comparison operators.  The XOR
+operation (exclusive or) is also included as "|^".  In this context,
+XOR means that if either the attribute or value is 0.0 and the other
+is non-zero, then the result is "true" and the threshold criterion is
+met.  Otherwise it is not met.
 
 :line
 
diff --git a/examples/body/in.body b/examples/body/in.body
index 604f0fbc89..5879ed5e45 100644
--- a/examples/body/in.body
+++ b/examples/body/in.body
@@ -11,13 +11,19 @@ velocity	all create 1.44 87287 loop geom
 pair_style	body 5.0
 pair_coeff	* * 1.0 1.0
 
-neighbor	0.3 bin
+neighbor	0.5 bin
+neigh_modify    every 1 delay 0 check yes
 
 fix		1 all nve/body
+#fix		1 all nvt/body temp 1.44 1.44 1.0
 fix		2 all enforce2d
 
 #compute         1 all body/local type 1 2 3
 #dump            1 all local 100 dump.body index c_1[1] c_1[2] c_1[3] c_1[4]
 
-thermo          500
+#dump		2 all image 1000 image.*.jpg type type &
+#		zoom 1.6 adiam 1.5 body type 1.0 0
+#dump_modify    2 pad 5
+
+thermo          100
 run		10000
diff --git a/src/angle.cpp b/src/angle.cpp
index a4a7aa83d9..14433fc44d 100644
--- a/src/angle.cpp
+++ b/src/angle.cpp
@@ -40,9 +40,6 @@ Angle::Angle(LAMMPS *lmp) : Pointers(lmp)
   vatom = NULL;
   setflag = NULL;
 
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
-
   execution_space = Host;
   datamask_read = ALL_MASK;
   datamask_modify = ALL_MASK;
diff --git a/src/angle.h b/src/angle.h
index 31d73176ac..119f206c36 100644
--- a/src/angle.h
+++ b/src/angle.h
@@ -29,10 +29,9 @@ class Angle : protected Pointers {
   double energy;                  // accumulated energies
   double virial[6];               // accumlated virial
   double *eatom,**vatom;          // accumulated per-atom energy/virial
-  unsigned int datamask;
-  unsigned int datamask_ext;
 
   // KOKKOS host/device flag and data masks
+
   ExecutionSpace execution_space;
   unsigned int datamask_read,datamask_modify;
   int copymode;
@@ -51,9 +50,6 @@ class Angle : protected Pointers {
   virtual double single(int, int, int, int) = 0;
   virtual double memory_usage();
 
-  virtual unsigned int data_mask() {return datamask;}
-  virtual unsigned int data_mask_ext() {return datamask_ext;}
-
  protected:
   int suffix_flag;             // suffix compatibility flag
 
diff --git a/src/atom.cpp b/src/atom.cpp
index 2db73d4325..8e48611284 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -208,9 +208,6 @@ Atom::Atom(LAMMPS *lmp) : Pointers(lmp)
   atom_style = NULL;
   avec = NULL;
 
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
-
   avec_map = new AtomVecCreatorMap();
 
 #define ATOM_CLASS
diff --git a/src/atom.h b/src/atom.h
index 31fd355f9a..61cd9673bf 100644
--- a/src/atom.h
+++ b/src/atom.h
@@ -124,11 +124,6 @@ class Atom : protected Pointers {
   char **iname,**dname;
   int nivector,ndvector;
 
-  // used by USER-CUDA to flag used per-atom arrays
-
-  unsigned int datamask;
-  unsigned int datamask_ext;
-
   // atom style and per-atom array existence flags
   // customize by adding new flag
 
diff --git a/src/atom_vec.h b/src/atom_vec.h
index 267cc599d1..698cdef2f5 100644
--- a/src/atom_vec.h
+++ b/src/atom_vec.h
@@ -156,10 +156,6 @@ E: Invalid atom_style command
 
 Self-explanatory.
 
-E: USER-CUDA package requires a cuda enabled atom_style
-
-Self-explanatory.
-
 E: KOKKOS package requires a kokkos enabled atom_style
 
 Self-explanatory.
diff --git a/src/bond.cpp b/src/bond.cpp
index 5c2622281d..8074d97677 100644
--- a/src/bond.cpp
+++ b/src/bond.cpp
@@ -44,9 +44,6 @@ Bond::Bond(LAMMPS *lmp) : Pointers(lmp)
   vatom = NULL;
   setflag = NULL;
 
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
-
   execution_space = Host;
   datamask_read = ALL_MASK;
   datamask_modify = ALL_MASK;
diff --git a/src/bond.h b/src/bond.h
index d455cda204..06e05a7265 100644
--- a/src/bond.h
+++ b/src/bond.h
@@ -29,10 +29,9 @@ class Bond : protected Pointers {
   double energy;                  // accumulated energies
   double virial[6];               // accumlated virial
   double *eatom,**vatom;          // accumulated per-atom energy/virial
-  unsigned int datamask;
-  unsigned int datamask_ext;
 
   // KOKKOS host/device flag and data masks
+
   ExecutionSpace execution_space;
   unsigned int datamask_read,datamask_modify;
   int copymode;
@@ -51,9 +50,6 @@ class Bond : protected Pointers {
   virtual double single(int, double, int, int, double &) = 0;
   virtual double memory_usage();
 
-  virtual unsigned int data_mask() {return datamask;}
-  virtual unsigned int data_mask_ext() {return datamask_ext;}
-
   void write_file(int, char**);
 
  protected:
diff --git a/src/comm_tiled.h b/src/comm_tiled.h
index 91feea7479..5d0fae1269 100644
--- a/src/comm_tiled.h
+++ b/src/comm_tiled.h
@@ -155,10 +155,6 @@ class CommTiled : public Comm {
 
 /* ERROR/WARNING messages:
 
-E: USER-CUDA package does not yet support comm_style tiled
-
-Self-explanatory.
-
 E: KOKKOS package does not yet support comm_style tiled
 
 Self-explanatory.
diff --git a/src/compute.cpp b/src/compute.cpp
index 96bf6ceb54..d306b0b34f 100644
--- a/src/compute.cpp
+++ b/src/compute.cpp
@@ -99,9 +99,6 @@ Compute::Compute(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp),
   
   // data masks
 
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
-
   execution_space = Host;
   datamask_read = ALL_MASK;
   datamask_modify = ALL_MASK;
diff --git a/src/compute.h b/src/compute.h
index aae706b34d..18da971f82 100644
--- a/src/compute.h
+++ b/src/compute.h
@@ -84,9 +84,6 @@ class Compute : protected Pointers {
   int comm_reverse;         // size of reverse communication (0 if none)
   int dynamic_group_allow;  // 1 if can be used with dynamic group, else 0
 
-  unsigned int datamask;
-  unsigned int datamask_ext;
-
   // KOKKOS host/device flag and data masks
 
   ExecutionSpace execution_space;
@@ -140,9 +137,6 @@ class Compute : protected Pointers {
                                    double, double, double,
                                    double, double, double) {}
 
-  virtual int unsigned data_mask() {return datamask;}
-  virtual int unsigned data_mask_ext() {return datamask_ext;}
-
  protected:
   int instance_me;             // which Compute class instantiation I am
 
diff --git a/src/dihedral.cpp b/src/dihedral.cpp
index 8ea76b44a1..4c941d7225 100644
--- a/src/dihedral.cpp
+++ b/src/dihedral.cpp
@@ -41,9 +41,6 @@ Dihedral::Dihedral(LAMMPS *lmp) : Pointers(lmp)
   vatom = NULL;
   setflag = NULL;
 
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
-
   execution_space = Host;
   datamask_read = ALL_MASK;
   datamask_modify = ALL_MASK;
diff --git a/src/dihedral.h b/src/dihedral.h
index 3e3ec0a4f2..68167eb86f 100644
--- a/src/dihedral.h
+++ b/src/dihedral.h
@@ -29,10 +29,9 @@ class Dihedral : protected Pointers {
   double energy;                     // accumulated energy
   double virial[6];                  // accumlated virial
   double *eatom,**vatom;             // accumulated per-atom energy/virial
-  unsigned int datamask;
-  unsigned int datamask_ext;
 
   // KOKKOS host/device flag and data masks
+
   ExecutionSpace execution_space;
   unsigned int datamask_read,datamask_modify;
   int copymode;
@@ -49,9 +48,6 @@ class Dihedral : protected Pointers {
   virtual void write_data(FILE *) {}
   virtual double memory_usage();
 
-  virtual unsigned int data_mask() {return datamask;}
-  virtual unsigned int data_mask_ext() {return datamask_ext;}
-
  protected:
   int suffix_flag;             // suffix compatibility flag
 
diff --git a/src/dump_custom.cpp b/src/dump_custom.cpp
index 033614789a..c10d6b1553 100644
--- a/src/dump_custom.cpp
+++ b/src/dump_custom.cpp
@@ -43,7 +43,7 @@ enum{ID,MOL,PROC,PROCP1,TYPE,ELEMENT,MASS,
      OMEGAX,OMEGAY,OMEGAZ,ANGMOMX,ANGMOMY,ANGMOMZ,
      TQX,TQY,TQZ,
      COMPUTE,FIX,VARIABLE,INAME,DNAME};
-enum{LT,LE,GT,GE,EQ,NEQ};
+enum{LT,LE,GT,GE,EQ,NEQ,XOR};
 enum{INT,DOUBLE,STRING,BIGINT};    // same as in DumpCFG
 
 #define INVOKED_PERATOM 8
@@ -947,6 +947,11 @@ int DumpCustom::count()
       } else if (thresh_op[ithresh] == NEQ) {
         for (i = 0; i < nlocal; i++, ptr += nstride)
           if (choose[i] && *ptr == value) choose[i] = 0;
+      } else if (thresh_op[ithresh] == XOR) {
+        for (i = 0; i < nlocal; i++, ptr += nstride)
+          if (choose[i] && (*ptr == 0.0 && value == 0.0) || 
+              (*ptr != 0.0 && value != 0.0))
+            choose[i] = 0;
       }
     }
   }
@@ -1835,6 +1840,7 @@ int DumpCustom::modify_param(int narg, char **arg)
     else if (strcmp(arg[2],">=") == 0) thresh_op[nthresh] = GE;
     else if (strcmp(arg[2],"==") == 0) thresh_op[nthresh] = EQ;
     else if (strcmp(arg[2],"!=") == 0) thresh_op[nthresh] = NEQ;
+    else if (strcmp(arg[2],"|^") == 0) thresh_op[nthresh] = XOR;
     else error->all(FLERR,"Invalid dump_modify threshold operator");
 
     // set threshold value
diff --git a/src/fix.cpp b/src/fix.cpp
index 5fd7764baa..9918e40e0f 100644
--- a/src/fix.cpp
+++ b/src/fix.cpp
@@ -95,10 +95,7 @@ id(NULL), style(NULL), eatom(NULL), vatom(NULL)
   maxeatom = maxvatom = 0;
   vflag_atom = 0;
 
-  // CUDA and KOKKOS per-fix data masks
-
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
+  // KOKKOS per-fix data masks
 
   execution_space = Host;
   datamask_read = ALL_MASK;
diff --git a/src/fix.h b/src/fix.h
index 9dfb642096..62cb565b15 100644
--- a/src/fix.h
+++ b/src/fix.h
@@ -99,11 +99,6 @@ class Fix : protected Pointers {
   ExecutionSpace execution_space;
   unsigned int datamask_read,datamask_modify;
 
-  // USER-CUDA per-fix data masks
-
-  unsigned int datamask;
-  unsigned int datamask_ext;
-
   Fix(class LAMMPS *, int, char **);
   virtual ~Fix();
   void modify_params(int, char **);
@@ -211,9 +206,6 @@ class Fix : protected Pointers {
 
   virtual double memory_usage() {return 0.0;}
 
-  virtual unsigned int data_mask() {return datamask;}
-  virtual unsigned int data_mask_ext() {return datamask_ext;}
-
  protected:
   int instance_me;        // which Fix class instantiation I am
 
diff --git a/src/imbalance_group.cpp b/src/imbalance_group.cpp
index 252e64568e..b8e7db44b6 100644
--- a/src/imbalance_group.cpp
+++ b/src/imbalance_group.cpp
@@ -18,12 +18,11 @@
 #include "error.h"
 
 using namespace LAMMPS_NS;
-#define SMALL 0.001
 
 /* -------------------------------------------------------------------- */
 
-ImbalanceGroup::ImbalanceGroup(LAMMPS *lmp) : Imbalance(lmp),
-                id(0), factor(0), num(0) {}
+ImbalanceGroup::ImbalanceGroup(LAMMPS *lmp) : Imbalance(lmp), id(0), factor(0)
+{}
 
 /* -------------------------------------------------------------------- */
 
@@ -50,7 +49,7 @@ int ImbalanceGroup::options(int narg, char **arg)
     if (id[i] < 0)
       error->all(FLERR,"Unknown group in balance weight command");
     factor[i] = force->numeric(FLERR,arg[2*i+2]);
-    if (factor[i] < 0.0) error->all(FLERR,"Illegal balance weight command");
+    if (factor[i] <= 0.0) error->all(FLERR,"Illegal balance weight command");
   }
   return 2*num+1;
 }
@@ -67,13 +66,10 @@ void ImbalanceGroup::compute(double *weight)
 
   for (int i = 0; i < nlocal; ++i) {
     const int imask = mask[i];
-    double iweight = weight[i];
     for (int j = 0; j < num; ++j) {
       if (imask & bitmask[id[j]])
-        iweight *= factor[j];
+        weight[i] *= factor[j];
     }
-    if (iweight < SMALL) weight[i] = SMALL;
-    else weight[i] = iweight;
   }
 }
 
diff --git a/src/imbalance_neigh.cpp b/src/imbalance_neigh.cpp
index c1f268df5a..24a6be2698 100644
--- a/src/imbalance_neigh.cpp
+++ b/src/imbalance_neigh.cpp
@@ -22,14 +22,14 @@
 #include "error.h"
 
 using namespace LAMMPS_NS;
-#define SMALL 0.001
+
+#define BIG 1.0e20
 
 /* -------------------------------------------------------------------- */
 
 ImbalanceNeigh::ImbalanceNeigh(LAMMPS *lmp) : Imbalance(lmp)
 {
   did_warn = 0;
-  factor = 1.0;
 }
 
 /* -------------------------------------------------------------------- */
@@ -38,8 +38,7 @@ int ImbalanceNeigh::options(int narg, char **arg)
 {
   if (narg < 1) error->all(FLERR,"Illegal balance weight command");
   factor = force->numeric(FLERR,arg[0]);
-  if ((factor < 0.0) || (factor > 2.0))
-    error->all(FLERR,"Illegal balance weight command");
+  if (factor <= 0.0) error->all(FLERR,"Illegal balance weight command");
   return 1;
 }
 
@@ -52,7 +51,7 @@ void ImbalanceNeigh::compute(double *weight)
   if (factor == 0.0) return;
 
   // find suitable neighbor list
-  // we can only make use of certain (conventional) neighbor lists
+  // can only use certain conventional neighbor lists
 
   for (req = 0; req < neighbor->old_nrequest; ++req) {
     if ((neighbor->old_requests[req]->half ||
@@ -65,37 +64,46 @@ void ImbalanceNeigh::compute(double *weight)
 
   if (req >= neighbor->old_nrequest || neighbor->ago < 0) {
     if (comm->me == 0 && !did_warn)
-      error->warning(FLERR,"No suitable neighbor list found. "
-                     "Neighbor weighted balancing skipped");
+      error->warning(FLERR,"Balance weight neigh skipped b/c no list found");
     did_warn = 1;
     return;
   }
 
+  // neighsum = total neigh count for atoms on this proc
+  // localwt = weight assigned to each owned atom
+
   NeighList *list = neighbor->lists[req];
-  bigint neighsum = 0;
-  
   const int inum = list->inum;
   const int * const ilist = list->ilist;
   const int * const numneigh = list->numneigh;
+  int nlocal = atom->nlocal;
 
-  // first pass: get local number of neighbors
-
+  bigint neighsum = 0;
   for (int i = 0; i < inum; ++i) neighsum += numneigh[ilist[i]];
+  double localwt = 0.0;
+  if (nlocal) localwt = 1.0*neighsum/nlocal;
 
-  double allatoms = static_cast <double>(atom->natoms);
-  if (allatoms == 0.0) allatoms = 1.0;
-  double allavg;
-  double myavg = static_cast<double>(neighsum)/allatoms;
-  MPI_Allreduce(&myavg,&allavg,1,MPI_DOUBLE,MPI_SUM,world);
-  
-  // second pass: compute and apply weights
+  if (nlocal && localwt <= 0.0) error->one(FLERR,"Balance weight <= 0.0");
 
-  double scale = 1.0/allavg;
-  for (int ii = 0; ii < inum; ++ii) {
-    const int i = ilist[ii];
-    weight[i] *= (1.0-factor) + factor*scale*numneigh[i];
-    if (weight[i] < SMALL) weight[i] = SMALL;
+  // apply factor if specified != 1.0
+  // wtlo,wthi = lo/hi values excluding 0.0 due to no atoms on this proc
+  // lo value does not change
+  // newhi = new hi value to give hi/lo ratio factor times larger/smaller
+  // expand/contract all localwt values from lo->hi to lo->newhi
+
+  if (factor != 1.0) {
+    double wtlo,wthi;
+    if (localwt == 0.0) localwt = BIG;
+    MPI_Allreduce(&localwt,&wtlo,1,MPI_DOUBLE,MPI_MIN,world);
+    if (localwt == BIG) localwt = 0.0;
+    MPI_Allreduce(&localwt,&wthi,1,MPI_DOUBLE,MPI_MAX,world);
+    if (wtlo == wthi) return;
+
+    double newhi = wthi*factor;
+    localwt = wtlo + ((localwt-wtlo)/(wthi-wtlo)) * (newhi-wtlo);
   }
+
+  for (int i = 0; i < nlocal; i++) weight[i] *= localwt;
 }
 
 /* -------------------------------------------------------------------- */
diff --git a/src/imbalance_time.cpp b/src/imbalance_time.cpp
index 0f99eef255..27a7537851 100644
--- a/src/imbalance_time.cpp
+++ b/src/imbalance_time.cpp
@@ -19,15 +19,16 @@
 #include "timer.h"
 #include "error.h"
 
+// DEBUG
+#include "update.h"
+
 using namespace LAMMPS_NS;
-#define SMALL 0.001
+
+#define BIG 1.0e20
 
 /* -------------------------------------------------------------------- */
 
-ImbalanceTime::ImbalanceTime(LAMMPS *lmp) : Imbalance(lmp)
-{
-  factor = 1.0;
-}
+ImbalanceTime::ImbalanceTime(LAMMPS *lmp) : Imbalance(lmp) {}
 
 /* -------------------------------------------------------------------- */
 
@@ -35,8 +36,7 @@ int ImbalanceTime::options(int narg, char **arg)
 {
   if (narg < 1) error->all(FLERR,"Illegal balance weight command");
   factor = force->numeric(FLERR,arg[0]);
-  if ((factor < 0.0) || (factor > 2.0))
-    error->all(FLERR,"Illegal balance weight command");
+  if (factor <= 0.0) error->all(FLERR,"Illegal balance weight command");
   return 1;
 }
 
@@ -53,37 +53,60 @@ void ImbalanceTime::init()
 
 void ImbalanceTime::compute(double *weight)
 {
-  const int nlocal = atom->nlocal;
-  const bigint natoms = atom->natoms;
+  if (!timer->has_normal()) return;
 
-  if (factor == 0.0) return;
+  // cost = CPU time for relevant timers since last invocation
+  // localwt = weight assigned to each owned atom
+  // just return if no time yet tallied
 
-  // compute the cost function of based on relevant timers
-  
-  if (timer->has_normal()) {
-      double cost = -last;
-      cost += timer->get_wall(Timer::PAIR);
-      cost += timer->get_wall(Timer::NEIGH);
-      cost += timer->get_wall(Timer::BOND);
-      cost += timer->get_wall(Timer::KSPACE);
+  double cost = -last;
+  cost += timer->get_wall(Timer::PAIR);
+  cost += timer->get_wall(Timer::NEIGH);
+  cost += timer->get_wall(Timer::BOND);
+  cost += timer->get_wall(Timer::KSPACE);
 
-      double allcost;
-      MPI_Allreduce(&cost,&allcost,1,MPI_DOUBLE,MPI_SUM,world);
+  /*
+  printf("TIME %ld %d %g %g: %g %g %g %g\n",
+         update->ntimestep,atom->nlocal,last,cost,
+         timer->get_wall(Timer::PAIR),
+         timer->get_wall(Timer::NEIGH),
+         timer->get_wall(Timer::BOND),
+         timer->get_wall(Timer::KSPACE));
+  */
 
-      if ((allcost > 0.0) && (nlocal > 0)) {
-        const double avgcost = allcost/natoms;
-        const double localcost = cost/nlocal;
-        const double scale = (1.0-factor) + factor*localcost/avgcost;
-        for (int i = 0; i < nlocal; ++i) {
-          weight[i] *= scale;
-          if (weight[i] < SMALL) weight[i] = SMALL;
-        }
-      }
+  double maxcost;
+  MPI_Allreduce(&cost,&maxcost,1,MPI_DOUBLE,MPI_MAX,world);
+  if (maxcost <= 0.0) return;
 
-      // record time up to this point
+  int nlocal = atom->nlocal;
+  double localwt = 0.0;
+  if (nlocal) localwt = cost/nlocal;
 
-      last += cost;
+  if (nlocal && localwt <= 0.0) error->one(FLERR,"Balance weight <= 0.0");
+
+  // apply factor if specified != 1.0
+  // wtlo,wthi = lo/hi localwt over all procs, ignoring procs with no atoms
+  // lo value does not change, newhi = factor*hi rescales the hi/lo ratio
+  // expand/contract all localwt values from lo->hi to lo->newhi
+  // if lo == hi skip rescaling (avoids 0/0) but still fall through to update last
+
+  if (factor != 1.0) {
+    double wtlo,wthi;
+    if (localwt == 0.0) localwt = BIG;
+    MPI_Allreduce(&localwt,&wtlo,1,MPI_DOUBLE,MPI_MIN,world);
+    if (localwt == BIG) localwt = 0.0;
+    MPI_Allreduce(&localwt,&wthi,1,MPI_DOUBLE,MPI_MAX,world);
+    if (wtlo != wthi) {
+      double newhi = wthi*factor;
+      localwt = wtlo + ((localwt-wtlo)/(wthi-wtlo)) * (newhi-wtlo);
+    }
   }
+
+  for (int i = 0; i < nlocal; i++) weight[i] *= localwt;
+  
+  // record time up to this point
+  
+  last += cost;
 }
 
 /* -------------------------------------------------------------------- */
diff --git a/src/imbalance_var.cpp b/src/imbalance_var.cpp
index 3f1f429a39..a5fd9084e5 100644
--- a/src/imbalance_var.cpp
+++ b/src/imbalance_var.cpp
@@ -24,11 +24,10 @@
 #include "update.h"
 
 using namespace LAMMPS_NS;
-#define SMALL 0.001
 
 /* -------------------------------------------------------------------- */
 
-ImbalanceVar::ImbalanceVar(LAMMPS *lmp) : Imbalance(lmp), name(0), id(0) {}
+ImbalanceVar::ImbalanceVar(LAMMPS *lmp) : Imbalance(lmp), name(0) {}
 
 /* -------------------------------------------------------------------- */
 
@@ -76,10 +75,15 @@ void ImbalanceVar::compute(double *weight)
   memory->create(values,nlocal,"imbalance:values");
 
   input->variable->compute_atom(id,all,values,1,0);
-  for (int i = 0; i < nlocal; ++i) {
-    weight[i] *= values[i];
-    if (weight[i] < SMALL) weight[i] = SMALL;
-  }
+
+  int flag = 0;
+  for (int i = 0; i < nlocal; i++)
+    if (values[i] <= 0.0) flag = 1;
+  int flagall;
+  MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
+  if (flagall) error->one(FLERR,"Balance weight <= 0.0");
+
+  for (int i = 0; i < nlocal; i++) weight[i] *= values[i];
 
   memory->destroy(values);
 }
diff --git a/src/improper.cpp b/src/improper.cpp
index e56a36cc01..e20d0d2666 100644
--- a/src/improper.cpp
+++ b/src/improper.cpp
@@ -38,9 +38,6 @@ Improper::Improper(LAMMPS *lmp) : Pointers(lmp)
   vatom = NULL;
   setflag = NULL;
 
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
-
   execution_space = Host;
   datamask_read = ALL_MASK;
   datamask_modify = ALL_MASK;
diff --git a/src/improper.h b/src/improper.h
index e029233ae9..b20bc732df 100644
--- a/src/improper.h
+++ b/src/improper.h
@@ -29,10 +29,9 @@ class Improper : protected Pointers {
   double energy;                  // accumulated energies
   double virial[6];               // accumlated virial
   double *eatom,**vatom;          // accumulated per-atom energy/virial
-  unsigned int datamask;
-  unsigned int datamask_ext;
 
   // KOKKOS host/device flag and data masks
+
   ExecutionSpace execution_space;
   unsigned int datamask_read,datamask_modify;
   int copymode;
@@ -49,9 +48,6 @@ class Improper : protected Pointers {
   virtual void write_data(FILE *) {}
   virtual double memory_usage();
 
-  virtual unsigned int data_mask() {return datamask;}
-  virtual unsigned int data_mask_ext() {return datamask_ext;}
-
  protected:
   int suffix_flag;             // suffix compatibility flag
 
diff --git a/src/input.h b/src/input.h
index 33a271ccdf..7f9cefe064 100644
--- a/src/input.h
+++ b/src/input.h
@@ -328,12 +328,6 @@ E: Package command after simulation box is defined
 The package command cannot be used afer a read_data, read_restart, or
 create_box command.
 
-E: Package cuda command without USER-CUDA package enabled
-
-The USER-CUDA package must be installed via "make yes-user-cuda"
-before LAMMPS is built, and the "-c on" must be used to enable the
-package.
-
 E: Package gpu command without GPU package installed
 
 The GPU package must be installed via "make yes-gpu" before LAMMPS is
diff --git a/src/kspace.cpp b/src/kspace.cpp
index d5123958a1..ca0d500195 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -88,9 +88,6 @@ KSpace::KSpace(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   eatom = NULL;
   vatom = NULL;
 
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
-
   execution_space = Host;
   datamask_read = ALL_MASK;
   datamask_modify = ALL_MASK;
diff --git a/src/kspace.h b/src/kspace.h
index e52fb1ec66..95fb6ffaf2 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -80,10 +80,8 @@ class KSpace : protected Pointers {
 
   int group_group_enable;         // 1 if style supports group/group calculation
 
-  unsigned int datamask;
-  unsigned int datamask_ext;
-
   // KOKKOS host/device flag and data masks
+
   ExecutionSpace execution_space;
   unsigned int datamask_read,datamask_modify;
   int copymode;
diff --git a/src/lammps.h b/src/lammps.h
index 3978f99111..9d8ca954a6 100644
--- a/src/lammps.h
+++ b/src/lammps.h
@@ -168,19 +168,10 @@ E: Cannot use -cuda on and -kokkos on together
 
 This is not allowed since both packages can use GPUs.
 
-E: Cannot use -cuda on without USER-CUDA installed
-
-The USER-CUDA package must be installed via "make yes-user-cuda"
-before LAMMPS is built.
-
 E: Cannot use -kokkos on without KOKKOS installed
 
 Self-explanatory.
 
-E: Using suffix cuda without USER-CUDA package enabled
-
-Self-explanatory.
-
 E: Using suffix gpu without GPU package installed
 
 Self-explanatory.
diff --git a/src/pair.cpp b/src/pair.cpp
index 3f1212b345..651cabed60 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -100,9 +100,6 @@ Pair::Pair(LAMMPS *lmp) : Pointers(lmp)
 
   // KOKKOS per-fix data masks
 
-  datamask = ALL_MASK;
-  datamask_ext = ALL_MASK;
-
   execution_space = Host;
   datamask_read = ALL_MASK;
   datamask_modify = ALL_MASK;
diff --git a/src/pair.h b/src/pair.h
index dbbead0d6c..3378115e49 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -97,9 +97,6 @@ class Pair : protected Pointers {
   class NeighList *listmiddle;
   class NeighList *listouter;
 
-  unsigned int datamask;
-  unsigned int datamask_ext;
-
   int allocated;                 // 0/1 = whether arrays are allocated
                                  //       public so external driver can check
   int compute_flag;              // 0 if skip compute()
@@ -191,9 +188,6 @@ class Pair : protected Pointers {
   virtual void min_xf_get(int) {}
   virtual void min_x_set(int) {}
 
-  virtual unsigned int data_mask() {return datamask;}
-  virtual unsigned int data_mask_ext() {return datamask_ext;}
-
   // management of callbacks to be run from ev_tally()
 
  protected:
diff --git a/src/suffix.h b/src/suffix.h
index 4ae80af895..817600dfd5 100644
--- a/src/suffix.h
+++ b/src/suffix.h
@@ -20,9 +20,8 @@ namespace Suffix {
   static const int NONE = 0;
   static const int OPT  = 1<<0;
   static const int GPU  = 1<<1;
-  static const int CUDA = 1<<2;
-  static const int OMP  = 1<<3;
-  static const int INTEL  = 1<<4;
+  static const int OMP  = 1<<2;
+  static const int INTEL  = 1<<3;
 }
 
 }
diff --git a/src/update.h b/src/update.h
index fe17dc6bb4..7996440318 100644
--- a/src/update.h
+++ b/src/update.h
@@ -81,14 +81,6 @@ class Update : protected Pointers {
 
 /* ERROR/WARNING messages:
 
-E: USER-CUDA mode requires CUDA variant of run style
-
-CUDA mode is enabled, so the run style must include a cuda suffix.
-
-E: USER-CUDA mode requires CUDA variant of min style
-
-CUDA mode is enabled, so the min style must include a cuda suffix.
-
 E: Illegal ... command
 
 Self-explanatory.  Check the input script syntax and compare to the
diff --git a/src/variable.cpp b/src/variable.cpp
index 0d4f6c9406..be09ebe69f 100644
--- a/src/variable.cpp
+++ b/src/variable.cpp
@@ -4813,72 +4813,6 @@ double Variable::evaluate_boolean(char *str)
   return argstack[0].value;
 }
 
-/* ---------------------------------------------------------------------- */
-
-unsigned int Variable::data_mask(int ivar)
-{
-  if (eval_in_progress[ivar]) return EMPTY_MASK;
-  eval_in_progress[ivar] = 1;
-  unsigned int datamask = data_mask(data[ivar][0]);
-  eval_in_progress[ivar] = 0;
-  return datamask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-unsigned int Variable::data_mask(char *str)
-{
-  unsigned int datamask = EMPTY_MASK;
-
-  for (unsigned int i = 0; i < strlen(str)-2; i++) {
-    int istart = i;
-    while (isalnum(str[i]) || str[i] == '_') i++;
-    int istop = i-1;
-
-    int n = istop - istart + 1;
-    char *word = new char[n+1];
-    strncpy(word,&str[istart],n);
-    word[n] = '\0';
-
-    // ----------------
-    // compute
-    // ----------------
-
-    if ((strncmp(word,"c_",2) == 0) && (i>0) && (!isalnum(str[i-1]))) {
-      if (domain->box_exist == 0)
-        error->all(FLERR,
-                   "Variable evaluation before simulation box is defined");
-
-      int icompute = modify->find_compute(word+2);
-      if (icompute < 0)
-        error->all(FLERR,"Invalid compute ID in variable formula");
-
-      datamask &= modify->compute[icompute]->data_mask();
-    }
-
-    if ((strncmp(word,"f_",2) == 0) && (i>0) && (!isalnum(str[i-1]))) {
-      if (domain->box_exist == 0)
-        error->all(FLERR,
-                   "Variable evaluation before simulation box is defined");
-
-      int ifix = modify->find_fix(word+2);
-      if (ifix < 0) error->all(FLERR,"Invalid fix ID in variable formula");
-
-      datamask &= modify->fix[ifix]->data_mask();
-    }
-
-    if ((strncmp(word,"v_",2) == 0) && (i>0) && (!isalnum(str[i-1]))) {
-      int ivar = find(word+2);
-      if (ivar < 0) error->all(FLERR,"Invalid variable name in variable formula");
-      datamask &= data_mask(ivar);
-    }
-
-    delete [] word;
-  }
-
-  return datamask;
-}
-
 /* ----------------------------------------------------------------------
    class to read variable values from a file
    for flag = SCALARFILE, reads one value per line
diff --git a/src/variable.h b/src/variable.h
index b773994782..cdcc607b18 100644
--- a/src/variable.h
+++ b/src/variable.h
@@ -49,9 +49,6 @@ class Variable : protected Pointers {
   tagint int_between_brackets(char *&, int);
   double evaluate_boolean(char *);
 
-  unsigned int data_mask(int ivar);
-  unsigned int data_mask(char *str);
-
  private:
   int me;
   int nvar;                // # of defined variables
diff --git a/src/version.h b/src/version.h
index 4009b5bdc4..7f68648837 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1 +1 @@
-#define LAMMPS_VERSION "30 Sep 2016"
+#define LAMMPS_VERSION "5 Oct 2016"