lammps/src/balance.cpp

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://lammps.sandia.gov/, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors, for weighted balancing:
     Axel Kohlmeyer (Temple U), Iain Bethune (EPCC)
------------------------------------------------------------------------- */

// #define BALANCE_DEBUG 1

#include "balance.h"

#include "update.h"
#include "atom.h"
#include "neighbor.h"
#include "comm.h"
#include "domain.h"
#include "fix_store.h"
#include "imbalance.h"
#include "imbalance_group.h"
#include "imbalance_neigh.h"
#include "imbalance_store.h"
#include "imbalance_time.h"
#include "imbalance_var.h"
#include "irregular.h"
#include "memory.h"
#include "modify.h"
#include "rcb.h"
#include "error.h"

#include <cmath>
#include <cstring>

using namespace LAMMPS_NS;

double EPSNEIGH = 1.0e-3;

enum{XYZ,SHIFT,BISECTION};
enum{NONE,UNIFORM,USER};
enum{X,Y,Z};

/* ---------------------------------------------------------------------- */

Balance::Balance(LAMMPS *lmp) : Command(lmp)
{
  MPI_Comm_rank(world,&me);
  MPI_Comm_size(world,&nprocs);

  user_xsplit = user_ysplit = user_zsplit = nullptr;
  shift_allocate = 0;
  proccost = allproccost = nullptr;

  rcb = nullptr;

  nimbalance = 0;
  imbalances = nullptr;
  fixstore = nullptr;

  fp = nullptr;
  firststep = 1;
}

/* ---------------------------------------------------------------------- */

Balance::~Balance()
{
  memory->destroy(proccost);
  memory->destroy(allproccost);

  delete [] user_xsplit;
  delete [] user_ysplit;
  delete [] user_zsplit;

  if (shift_allocate) {
    delete [] bdim;
    delete [] onecost;
    delete [] allcost;
    delete [] sum;
    delete [] target;
    delete [] lo;
    delete [] hi;
    delete [] losum;
    delete [] hisum;
  }

  delete rcb;

  for (int i = 0; i < nimbalance; i++) delete imbalances[i];
  delete [] imbalances;

  // check nfix in case all fixes have already been deleted

  if (fixstore && modify->nfix) modify->delete_fix(fixstore->id);
  fixstore = nullptr;

  if (fp) fclose(fp);
}

/* ----------------------------------------------------------------------
   called as balance command in input script
------------------------------------------------------------------------- */

void Balance::command(int narg, char **arg)
{
  if (domain->box_exist == 0)
    error->all(FLERR,"Balance command before simulation box is defined");

  if (me == 0) utils::logmesg(lmp,"Balancing ...\n");

  // parse required arguments

  if (narg < 2) error->all(FLERR,"Illegal balance command");

  thresh = utils::numeric(FLERR,arg[0],false,lmp);

  int dimension = domain->dimension;
  int *procgrid = comm->procgrid;
  style = -1;
  xflag = yflag = zflag = NONE;

  int iarg = 1;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"x") == 0) {
      if (style != -1 && style != XYZ)
        error->all(FLERR,"Illegal balance command");
      style = XYZ;
      if (strcmp(arg[iarg+1],"uniform") == 0) {
        if (iarg+2 > narg) error->all(FLERR,"Illegal balance command");
        xflag = UNIFORM;
        iarg += 2;
      } else {
        if (1 + procgrid[0]-1 > narg)
          error->all(FLERR,"Illegal balance command");
        xflag = USER;
        delete [] user_xsplit;
        user_xsplit = new double[procgrid[0]+1];
        user_xsplit[0] = 0.0;
        iarg++;
        for (int i = 1; i < procgrid[0]; i++)
          user_xsplit[i] = utils::numeric(FLERR,arg[iarg++],false,lmp);
        user_xsplit[procgrid[0]] = 1.0;
      }
    } else if (strcmp(arg[iarg],"y") == 0) {
      if (style != -1 && style != XYZ)
        error->all(FLERR,"Illegal balance command");
      style = XYZ;
      if (strcmp(arg[iarg+1],"uniform") == 0) {
        if (iarg+2 > narg) error->all(FLERR,"Illegal balance command");
        yflag = UNIFORM;
        iarg += 2;
      } else {
        if (1 + procgrid[1]-1 > narg)
          error->all(FLERR,"Illegal balance command");
        yflag = USER;
        delete [] user_ysplit;
        user_ysplit = new double[procgrid[1]+1];
        user_ysplit[0] = 0.0;
        iarg++;
        for (int i = 1; i < procgrid[1]; i++)
          user_ysplit[i] = utils::numeric(FLERR,arg[iarg++],false,lmp);
        user_ysplit[procgrid[1]] = 1.0;
      }
    } else if (strcmp(arg[iarg],"z") == 0) {
      if (style != -1 && style != XYZ)
        error->all(FLERR,"Illegal balance command");
      style = XYZ;
      if (strcmp(arg[iarg+1],"uniform") == 0) {
        if (iarg+2 > narg) error->all(FLERR,"Illegal balance command");
        zflag = UNIFORM;
        iarg += 2;
      } else {
        if (1 + procgrid[2]-1 > narg)
          error->all(FLERR,"Illegal balance command");
        zflag = USER;
        delete [] user_zsplit;
        user_zsplit = new double[procgrid[2]+1];
        user_zsplit[0] = 0.0;
        iarg++;
        for (int i = 1; i < procgrid[2]; i++)
          user_zsplit[i] = utils::numeric(FLERR,arg[iarg++],false,lmp);
        user_zsplit[procgrid[2]] = 1.0;
      }

    } else if (strcmp(arg[iarg],"shift") == 0) {
      if (style != -1) error->all(FLERR,"Illegal balance command");
      if (iarg+4 > narg) error->all(FLERR,"Illegal balance command");
      style = SHIFT;
      if (strlen(arg[iarg+1]) > 3) error->all(FLERR,"Illegal balance command");
      strcpy(bstr,arg[iarg+1]);
      nitermax = utils::inumeric(FLERR,arg[iarg+2],false,lmp);
      if (nitermax <= 0) error->all(FLERR,"Illegal balance command");
      stopthresh = utils::numeric(FLERR,arg[iarg+3],false,lmp);
      if (stopthresh < 1.0) error->all(FLERR,"Illegal balance command");
      iarg += 4;

    } else if (strcmp(arg[iarg],"rcb") == 0) {
      if (style != -1) error->all(FLERR,"Illegal balance command");
      style = BISECTION;
      iarg++;

    } else break;
  }

  // error checks

  if (style == XYZ) {
    if (zflag != NONE  && dimension == 2)
      error->all(FLERR,"Cannot balance in z dimension for 2d simulation");

    if (xflag == USER)
      for (int i = 1; i <= procgrid[0]; i++)
        if (user_xsplit[i-1] >= user_xsplit[i])
          error->all(FLERR,"Illegal balance command");
    if (yflag == USER)
      for (int i = 1; i <= procgrid[1]; i++)
        if (user_ysplit[i-1] >= user_ysplit[i])
          error->all(FLERR,"Illegal balance command");
    if (zflag == USER)
      for (int i = 1; i <= procgrid[2]; i++)
        if (user_zsplit[i-1] >= user_zsplit[i])
          error->all(FLERR,"Illegal balance command");
  }

  if (style == SHIFT) {
    const int blen=strlen(bstr);
    for (int i = 0; i < blen; i++) {
      if (bstr[i] != 'x' && bstr[i] != 'y' && bstr[i] != 'z')
        error->all(FLERR,"Balance shift string is invalid");
      if (bstr[i] == 'z' && dimension == 2)
        error->all(FLERR,"Balance shift string is invalid");
      for (int j = i+1; j < blen; j++)
        if (bstr[i] == bstr[j])
          error->all(FLERR,"Balance shift string is invalid");
    }
  }

  if (style == BISECTION && comm->style == 0)
    error->all(FLERR,"Balance rcb cannot be used with comm_style brick");

  // process remaining optional args

  options(iarg,narg,arg);
  if (wtflag) weight_storage(nullptr);

  // insure particles are in current box & update box via shrink-wrap
  // init entire system since comm->setup is done
  // comm::init needs neighbor::init needs pair::init needs kspace::init, etc
  // must reset atom map after exchange() since it clears it

  MPI_Barrier(world);
  double start_time = MPI_Wtime();

  lmp->init();

  if (domain->triclinic) domain->x2lamda(atom->nlocal);
  domain->pbc();
  domain->reset_box();
  comm->setup();
  comm->exchange();
  if (atom->map_style != Atom::MAP_NONE) atom->map_set();
  if (domain->triclinic) domain->lamda2x(atom->nlocal);

  // imbinit = initial imbalance

  double maxinit;
  init_imbalance(0);
  set_weights();
  double imbinit = imbalance_factor(maxinit);

  // no load-balance if imbalance doesn't exceed threshold
  // unless switching from tiled to non tiled layout, then force rebalance

  if (comm->layout == Comm::LAYOUT_TILED && style != BISECTION) {
  } else if (imbinit < thresh) return;

  // debug output of initial state

#ifdef BALANCE_DEBUG
  if (outflag) dumpout(update->ntimestep);
#endif

  int niter = 0;

  // perform load-balance
  // style XYZ = explicit setting of cutting planes of logical 3d grid

  if (style == XYZ) {
    if (comm->layout == Comm::LAYOUT_UNIFORM) {
      if (xflag == USER || yflag == USER || zflag == USER)
        comm->layout = Comm::LAYOUT_NONUNIFORM;
    } else if (comm->layout == Comm::LAYOUT_NONUNIFORM) {
      if (xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
        comm->layout = Comm::LAYOUT_UNIFORM;
    } else if (comm->layout == Comm::LAYOUT_TILED) {
      if (xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM)
        comm->layout = Comm::LAYOUT_UNIFORM;
      else comm->layout = Comm::LAYOUT_NONUNIFORM;
    }

    if (xflag == UNIFORM) {
      for (int i = 0; i < procgrid[0]; i++)
        comm->xsplit[i] = i * 1.0/procgrid[0];
      comm->xsplit[procgrid[0]] = 1.0;
    } else if (xflag == USER)
      for (int i = 0; i <= procgrid[0]; i++) comm->xsplit[i] = user_xsplit[i];

    if (yflag == UNIFORM) {
      for (int i = 0; i < procgrid[1]; i++)
        comm->ysplit[i] = i * 1.0/procgrid[1];
      comm->ysplit[procgrid[1]] = 1.0;
    } else if (yflag == USER)
      for (int i = 0; i <= procgrid[1]; i++) comm->ysplit[i] = user_ysplit[i];

    if (zflag == UNIFORM) {
      for (int i = 0; i < procgrid[2]; i++)
        comm->zsplit[i] = i * 1.0/procgrid[2];
      comm->zsplit[procgrid[2]] = 1.0;
    } else if (zflag == USER)
      for (int i = 0; i <= procgrid[2]; i++) comm->zsplit[i] = user_zsplit[i];
  }

  // style SHIFT = adjust cutting planes of logical 3d grid

  if (style == SHIFT) {
    comm->layout = Comm::LAYOUT_NONUNIFORM;
    shift_setup_static(bstr);
    niter = shift();
  }

  // style BISECTION = recursive coordinate bisectioning

  if (style == BISECTION) {
    comm->layout = Comm::LAYOUT_TILED;
    bisection(1);
  }

  // reset proc sub-domains
  // for either brick or tiled comm style

  if (domain->triclinic) domain->set_lamda_box();
  domain->set_local_box();

  // move particles to new processors via irregular()
  // set disable = 0, so weights migrate with atoms for imbfinal calculation

  if (domain->triclinic) domain->x2lamda(atom->nlocal);
  Irregular *irregular = new Irregular(lmp);
  if (wtflag) fixstore->disable = 0;
  if (style == BISECTION) irregular->migrate_atoms(1,1,rcb->sendproc);
  else irregular->migrate_atoms(1);
  delete irregular;
  if (domain->triclinic) domain->lamda2x(atom->nlocal);

  // output of final result

  if (outflag) dumpout(update->ntimestep);

  // check if any particles were lost

  bigint natoms;
  bigint nblocal = atom->nlocal;
  MPI_Allreduce(&nblocal,&natoms,1,MPI_LMP_BIGINT,MPI_SUM,world);
  if (natoms != atom->natoms)
    error->all(FLERR,fmt::format("Lost atoms via balance: "
                                 "original {}  current {}",
                                 atom->natoms,natoms).c_str());

  // imbfinal = final imbalance
  // set disable = 1, so weights no longer migrate with atoms

  double maxfinal;
  double imbfinal = imbalance_factor(maxfinal);
  if (wtflag) fixstore->disable = 1;

  // stats output

  if (me == 0) {
    std::string mesg = fmt::format(" rebalancing time: {:.3f} seconds\n",
                                   MPI_Wtime()-start_time);
    mesg += fmt::format("  iteration count = {}\n",niter);
    for (int i = 0; i < nimbalance; ++i) mesg += imbalances[i]->info();
    mesg += fmt::format("  initial/final maximal load/proc = {:.8} {:.8}\n"
                        "  initial/final imbalance factor  = {:.8} {:.8}\n",
                        maxinit,maxfinal,imbinit,imbfinal);

    if (style != BISECTION) {
      mesg += "  x cuts:";
      for (int i = 0; i <= comm->procgrid[0]; i++)
        mesg += fmt::format(" {:.8}",comm->xsplit[i]);
      mesg += "\n  y cuts:";
      for (int i = 0; i <= comm->procgrid[1]; i++)
        mesg += fmt::format(" {:.8}",comm->ysplit[i]);
      mesg += "\n  z cuts:";
      for (int i = 0; i <= comm->procgrid[2]; i++)
        mesg += fmt::format(" {:.8}",comm->zsplit[i]);
      mesg += "\n";
    }

    utils::logmesg(lmp,mesg);
  }
}

/* ----------------------------------------------------------------------
   process optional command args for Balance and FixBalance
------------------------------------------------------------------------- */

void Balance::options(int iarg, int narg, char **arg)
{
  // count max number of weight settings

  nimbalance = 0;
  for (int i = iarg; i < narg; i++)
    if (strcmp(arg[i],"weight") == 0) nimbalance++;
  if (nimbalance) imbalances = new Imbalance*[nimbalance];
  nimbalance = 0;

  wtflag = 0;
  varflag = 0;
  oldrcb = 0;
  outflag = 0;
  int outarg = 0;
  fp = nullptr;

  while (iarg < narg) {
    if (strcmp(arg[iarg],"weight") == 0) {
      wtflag = 1;
      Imbalance *imb;
      int nopt = 0;
      if (strcmp(arg[iarg+1],"group") == 0) {
        imb = new ImbalanceGroup(lmp);
        nopt = imb->options(narg-iarg,arg+iarg+2);
        imbalances[nimbalance++] = imb;
      } else if (strcmp(arg[iarg+1],"time") == 0) {
        imb = new ImbalanceTime(lmp);
        nopt = imb->options(narg-iarg,arg+iarg+2);
        imbalances[nimbalance++] = imb;
      } else if (strcmp(arg[iarg+1],"neigh") == 0) {
        imb = new ImbalanceNeigh(lmp);
        nopt = imb->options(narg-iarg,arg+iarg+2);
        imbalances[nimbalance++] = imb;
      } else if (strcmp(arg[iarg+1],"var") == 0) {
        varflag = 1;
        imb = new ImbalanceVar(lmp);
        nopt = imb->options(narg-iarg,arg+iarg+2);
        imbalances[nimbalance++] = imb;
      } else if (strcmp(arg[iarg+1],"store") == 0) {
        imb = new ImbalanceStore(lmp);
        nopt = imb->options(narg-iarg,arg+iarg+2);
        imbalances[nimbalance++] = imb;
      } else {
        error->all(FLERR,"Unknown (fix) balance weight method");
      }
      iarg += 2+nopt;

    } else if (strcmp(arg[iarg],"old") == 0) {
      oldrcb = 1;
      iarg++;
    } else if (strcmp(arg[iarg],"out") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal (fix) balance command");
      outflag = 1;
      outarg = iarg+1;
      iarg += 2;
    } else error->all(FLERR,"Illegal (fix) balance command");
  }

  // output file

  if (outflag && comm->me == 0) {
    fp = fopen(arg[outarg],"w");
    if (fp == nullptr)
      error->one(FLERR,fmt::format("Cannot open (fix) balance output file {}: {}",
                                   arg[outarg], utils::getsyserror()));
  }
}

/* ----------------------------------------------------------------------
   allocate per-particle weight storage via FixStore
   use prefix to distinguish Balance vs FixBalance storage
   fix could already be allocated if fix balance is re-specified
------------------------------------------------------------------------- */

void Balance::weight_storage(char *prefix)
{
  std::string cmd = "";

  if (prefix) cmd = prefix;
  cmd += "IMBALANCE_WEIGHTS";

  int ifix = modify->find_fix(cmd);
  if (ifix < 1) {
    cmd += " all STORE peratom 0 1";
    modify->add_fix(cmd);
    fixstore = (FixStore *) modify->fix[modify->nfix-1];
  } else fixstore = (FixStore *) modify->fix[ifix];

  // do not carry weights with atoms during normal atom migration

  fixstore->disable = 1;
}

/* ----------------------------------------------------------------------
   invoke init() for each Imbalance class
   flag = 0 for call from Balance, 1 for call from FixBalance
------------------------------------------------------------------------- */

void Balance::init_imbalance(int flag)
{
  if (!wtflag) return;
  for (int n = 0; n < nimbalance; n++) imbalances[n]->init(flag);
}

/* ----------------------------------------------------------------------
   set weight for each particle
   via list of Nimbalance classes
------------------------------------------------------------------------- */

void Balance::set_weights()
{
  if (!wtflag) return;
  weight = fixstore->vstore;

  int nlocal = atom->nlocal;
  for (int i = 0; i < nlocal; i++) weight[i] = 1.0;
  for (int n = 0; n < nimbalance; n++) imbalances[n]->compute(weight);
}

/* ----------------------------------------------------------------------
   calculate imbalance factor based on particle count or particle weights
   return max = max load per proc
   return imbalance = max load per proc / ave load per proc
------------------------------------------------------------------------- */

double Balance::imbalance_factor(double &maxcost)
{
  double mycost,totalcost;

  if (wtflag) {
    weight = fixstore->vstore;
    int nlocal = atom->nlocal;

    mycost = 0.0;
    for (int i = 0; i < nlocal; i++) mycost += weight[i];

  } else mycost = atom->nlocal;

  MPI_Allreduce(&mycost,&maxcost,1,MPI_DOUBLE,MPI_MAX,world);
  MPI_Allreduce(&mycost,&totalcost,1,MPI_DOUBLE,MPI_SUM,world);

  double imbalance = 1.0;
  if (maxcost > 0.0) imbalance = maxcost / (totalcost/nprocs);
  return imbalance;
}

/* ----------------------------------------------------------------------
   perform balancing via RCB class
   sortflag = flag for sorting order of received messages by proc ID
   return list of procs to send my atoms to
------------------------------------------------------------------------- */

int *Balance::bisection(int sortflag)
{
  if (!rcb) rcb = new RCB(lmp);

  int dim = domain->dimension;
  int triclinic = domain->triclinic;

  double *boxlo,*boxhi,*prd;

  if (triclinic == 0) {
    boxlo = domain->boxlo;
    boxhi = domain->boxhi;
    prd = domain->prd;
  } else {
    boxlo = domain->boxlo_lamda;
    boxhi = domain->boxhi_lamda;
    prd = domain->prd_lamda;
  }

  // shrink-wrap simulation box around atoms for input to RCB
  // leads to better-shaped sub-boxes when atoms are far from box boundaries
  // if triclinic, do this in lamda coords

  double shrink[6],shrinkall[6];

  shrink[0] = boxhi[0]; shrink[1] = boxhi[1]; shrink[2] = boxhi[2];
  shrink[3] = boxlo[0]; shrink[4] = boxlo[1]; shrink[5] = boxlo[2];

  double **x = atom->x;
  int nlocal = atom->nlocal;

  if (triclinic) domain->x2lamda(nlocal);

  for (int i = 0; i < nlocal; i++) {
    shrink[0] = MIN(shrink[0],x[i][0]);
    shrink[1] = MIN(shrink[1],x[i][1]);
    shrink[2] = MIN(shrink[2],x[i][2]);
    shrink[3] = MAX(shrink[3],x[i][0]);
    shrink[4] = MAX(shrink[4],x[i][1]);
    shrink[5] = MAX(shrink[5],x[i][2]);
  }

  shrink[3] = -shrink[3]; shrink[4] = -shrink[4]; shrink[5] = -shrink[5];
  MPI_Allreduce(shrink,shrinkall,6,MPI_DOUBLE,MPI_MIN,world);
  shrinkall[3] = -shrinkall[3];
  shrinkall[4] = -shrinkall[4];
  shrinkall[5] = -shrinkall[5];

  double *shrinklo = &shrinkall[0];
  double *shrinkhi = &shrinkall[3];

  // if shrink size in any dim is zero, use box size in that dim

  if (shrinklo[0] == shrinkhi[0]) {
    shrinklo[0] = boxlo[0];
    shrinkhi[0] = boxhi[0];
  }
  if (shrinklo[1] == shrinkhi[1]) {
    shrinklo[1] = boxlo[1];
    shrinkhi[1] = boxhi[1];
  }
  if (shrinklo[2] == shrinkhi[2]) {
    shrinklo[2] = boxlo[2];
    shrinkhi[2] = boxhi[2];
  }

  // invoke RCB
  // then invert() to create list of proc assignments for my atoms
  // if triclinic, RCB operates on lamda coords
  // NOTE: (3/2017) can remove undocumented "old" option at some point
  //       ditto in rcb.cpp, or make it an option

  if (oldrcb) {
    if (wtflag) {
      weight = fixstore->vstore;
      rcb->compute_old(dim,atom->nlocal,atom->x,weight,shrinklo,shrinkhi);
    } else rcb->compute_old(dim,atom->nlocal,atom->x,nullptr,shrinklo,shrinkhi);
  } else {
    if (wtflag) {
      weight = fixstore->vstore;
      rcb->compute(dim,atom->nlocal,atom->x,weight,shrinklo,shrinkhi);
    } else rcb->compute(dim,atom->nlocal,atom->x,nullptr,shrinklo,shrinkhi);
  }

  if (triclinic) domain->lamda2x(nlocal);

  rcb->invert(sortflag);

  // reset RCB lo/hi bounding box to full simulation box as needed

  double *lo = rcb->lo;
  double *hi = rcb->hi;

  if (lo[0] == shrinklo[0]) lo[0] = boxlo[0];
  if (lo[1] == shrinklo[1]) lo[1] = boxlo[1];
  if (lo[2] == shrinklo[2]) lo[2] = boxlo[2];
  if (hi[0] == shrinkhi[0]) hi[0] = boxhi[0];
  if (hi[1] == shrinkhi[1]) hi[1] = boxhi[1];
  if (hi[2] == shrinkhi[2]) hi[2] = boxhi[2];

  // store RCB cut, dim, lo/hi box in CommTiled
  // cut and lo/hi need to be in fractional form so can
  // OK if changes by epsilon from what RCB used since atoms
  //   will subsequently migrate to new owning procs by exchange() anyway
  // ditto for atoms exactly on lo/hi RCB box boundaries due to ties

  comm->rcbnew = 1;

  int idim = rcb->cutdim;
  if (idim >= 0) comm->rcbcutfrac = (rcb->cut - boxlo[idim]) / prd[idim];
  else comm->rcbcutfrac = 0.0;
  comm->rcbcutdim = idim;

  double (*mysplit)[2] = comm->mysplit;

  mysplit[0][0] = (lo[0] - boxlo[0]) / prd[0];
  if (hi[0] == boxhi[0]) mysplit[0][1] = 1.0;
  else mysplit[0][1] = (hi[0] - boxlo[0]) / prd[0];

  mysplit[1][0] = (lo[1] - boxlo[1]) / prd[1];
  if (hi[1] == boxhi[1]) mysplit[1][1] = 1.0;
  else mysplit[1][1] = (hi[1] - boxlo[1]) / prd[1];

  mysplit[2][0] = (lo[2] - boxlo[2]) / prd[2];
  if (hi[2] == boxhi[2]) mysplit[2][1] = 1.0;
  else mysplit[2][1] = (hi[2] - boxlo[2]) / prd[2];

  // return list of procs to send my atoms to

  return rcb->sendproc;
}

/* ----------------------------------------------------------------------
   setup static load balance operations
   called from command and indirectly initially from fix balance
   set rho = 0 for static balancing
------------------------------------------------------------------------- */

void Balance::shift_setup_static(char *str)
{
  shift_allocate = 1;

  memory->create(proccost,nprocs,"balance:proccost");
  memory->create(allproccost,nprocs,"balance:allproccost");

  ndim = strlen(str);
  bdim = new int[ndim];

  for (int i = 0; i < ndim; i++) {
    if (str[i] == 'x') bdim[i] = X;
    if (str[i] == 'y') bdim[i] = Y;
    if (str[i] == 'z') bdim[i] = Z;
  }

  int max = MAX(comm->procgrid[0],comm->procgrid[1]);
  max = MAX(max,comm->procgrid[2]);

  onecost = new double[max];
  allcost = new double[max];
  sum = new double[max+1];
  target = new double[max+1];
  lo = new double[max+1];
  hi = new double[max+1];
  losum = new double[max+1];
  hisum = new double[max+1];

  // if current layout is TILED, set initial uniform splits in Comm
  // this gives starting point to subsequent shift balancing

  if (comm->layout == Comm::LAYOUT_TILED) {
    int *procgrid = comm->procgrid;
    double *xsplit = comm->xsplit;
    double *ysplit = comm->ysplit;
    double *zsplit = comm->zsplit;

    for (int i = 0; i < procgrid[0]; i++) xsplit[i] = i * 1.0/procgrid[0];
    for (int i = 0; i < procgrid[1]; i++) ysplit[i] = i * 1.0/procgrid[1];
    for (int i = 0; i < procgrid[2]; i++) zsplit[i] = i * 1.0/procgrid[2];
    xsplit[procgrid[0]] = ysplit[procgrid[1]] = zsplit[procgrid[2]] = 1.0;
  }

  rho = 0;
}

/* ----------------------------------------------------------------------
   setup shift load balance operations
   called from fix balance
   set rho = 1 to do dynamic balancing after call to shift_setup_static()
------------------------------------------------------------------------- */

void Balance::shift_setup(char *str, int nitermax_in, double thresh_in)
{
  shift_setup_static(str);
  nitermax = nitermax_in;
  stopthresh = thresh_in;
  rho = 1;
}

/* ----------------------------------------------------------------------
   load balance by changing xyz split proc boundaries in Comm
   called one time from input script command or many times from fix balance
   return niter = iteration count
------------------------------------------------------------------------- */

int Balance::shift()
{
  int i,j,k,m,np;
  double mycost,totalcost,boxsize;
  double *split;

  // no balancing if no atoms

  bigint natoms = atom->natoms;
  if (natoms == 0) return 0;

  // set delta for 1d balancing = root of threshold
  // root = # of dimensions being balanced on

  double delta = pow(stopthresh,1.0/ndim) - 1.0;
  int *procgrid = comm->procgrid;

  // all balancing done in lamda coords

  domain->x2lamda(atom->nlocal);

  // loop over dimensions in balance string

  double *prd = domain->prd;

  int niter = 0;
  for (int idim = 0; idim < ndim; idim++) {

    // split = ptr to xyz split in Comm

    if (bdim[idim] == X) {
      split = comm->xsplit;
      boxsize = prd[0];
    } else if (bdim[idim] == Y) {
      split = comm->ysplit;
      boxsize = prd[1];
    } else if (bdim[idim] == Z) {
      split = comm->zsplit;
      boxsize = prd[2];
    } else continue;

    // initial count and sum

    np = procgrid[bdim[idim]];
    tally(bdim[idim],np,split);

    // target[i] = desired sum at split I

    if (wtflag) {
      weight = fixstore->vstore;
      int nlocal = atom->nlocal;
      mycost = 0.0;
      for (i = 0; i < nlocal; i++) mycost += weight[i];
    } else mycost = atom->nlocal;

    MPI_Allreduce(&mycost,&totalcost,1,MPI_DOUBLE,MPI_SUM,world);

    for (i = 0; i < np; i++) target[i] = totalcost/np * i;
    target[np] = totalcost;

    // lo[i] = closest split <= split[i] with a sum <= target
    // hi[i] = closest split >= split[i] with a sum >= target

    lo[0] = hi[0] = 0.0;
    lo[np] = hi[np] = 1.0;
    losum[0] = hisum[0] = 0.0;
    losum[np] = hisum[np] = totalcost;

    for (i = 1; i < np; i++) {
      for (j = i; j >= 0; j--)
        if (sum[j] <= target[i]) {
          lo[i] = split[j];
          losum[i] = sum[j];
          break;
        }
      for (j = i; j <= np; j++)
        if (sum[j] >= target[i]) {
          hi[i] = split[j];
          hisum[i] = sum[j];
          break;
        }
    }

    // iterate until balanced

#ifdef BALANCE_DEBUG
    if (me == 0) debug_shift_output(idim,0,np,split);
#endif

    int doneflag;
    int change = 1;
    for (m = 0; m < nitermax; m++) {
      change = adjust(np,split);
      tally(bdim[idim],np,split);
      niter++;

#ifdef BALANCE_DEBUG
      if (me == 0) debug_shift_output(idim,m+1,np,split);
      if (outflag) dumpout(update->ntimestep);
#endif

      // stop if no change in splits, b/c all targets are met exactly

      if (!change) break;

      // stop if all split sums are within delta of targets
      // this is a 1d test of particle count per slice
      // assumption is that this is sufficient accuracy
      //   for 3d imbalance factor to reach threshold

      doneflag = 1;
      for (i = 1; i < np; i++)
        if (fabs(1.0*(sum[i]-target[i]))/target[i] > delta) doneflag = 0;
      if (doneflag) break;
    }

    // eliminate final adjacent splits that are duplicates
    // can happen if particle distribution is narrow and Nitermax is small
    // set lo = midpt between splits
    // spread duplicates out evenly between bounding midpts with non-duplicates
    // i,j = lo/hi indices of set of duplicate splits
    // delta = new spacing between duplicates
    // bounding midpts = lo[i-1] and lo[j]

    int duplicate = 0;
    for (i = 1; i < np-1; i++)
      if (split[i] == split[i+1]) duplicate = 1;
    if (duplicate) {
      for (i = 0; i < np; i++)
        lo[i] = 0.5 * (split[i] + split[i+1]);
      i = 1;
      while (i < np-1) {
        j = i+1;
        while (split[j] == split[i]) j++;
        j--;
        if (j > i) {
          delta = (lo[j] - lo[i-1]) / (j-i+2);
          for (k = i; k <= j; k++)
            split[k] = lo[i-1] + (k-i+1)*delta;
        }
        i = j+1;
      }
    }

    // adjust adjacent splits that are too close (within neigh skin)
    // do this with minimal adjustment to splits

    double close = (1.0+EPSNEIGH) * neighbor->skin / boxsize;
    double delta,midpt,start,stop,lbound,ubound,spacing;

    i = 0;
    while (i < np) {
      if (split[i+1] - split[i] < close) {
        j = i+1;

        // I,J = set of consecutive splits that are collectively too close
        // if can expand set and not become too close to splits I-1 or J+1, do it
        // else add split I-1 or J+1 to set and try again
        // delta = size of expanded split set that will satisy criterion

        while (1) {
          delta = (j-i) * close;
          midpt = 0.5 * (split[i]+split[j]);
          start = midpt - 0.5*delta;
          stop = midpt + 0.5*delta;

          if (i > 0) lbound = split[i-1] + close;
          else lbound = 0.0;
          if (j < np) ubound = split[j+1] - close;
          else ubound = 1.0;

          // start/stop are within bounds, reset the splits

          if (start >= lbound && stop <= ubound) break;

          // try a shift to either bound, reset the splits if delta fits
          // these tests change start/stop

          if (start < lbound) {
            start = lbound;
            stop = start + delta;
            if (stop <= ubound) break;
          } else if (stop > ubound) {
            stop = ubound;
            start = stop - delta;
            if (start >= lbound) break;
          }

          // delta does not fit between lbound and ubound
          // exit if can't expand set, else expand set
          // if can expand in either direction,
          //   pick new split closest to current midpt of set

          if (i == 0 && j == np) {
            start = 0.0; stop = 1.0;
            break;
          }
          if (i == 0) j++;
          else if (j == np) i--;
          else if (midpt-lbound < ubound-midpt) i--;
          else j++;
        }

        // reset all splits between I,J inclusive to be equi-spaced

        spacing = (stop-start) / (j-i);
        for (m = i; m <= j; m++)
          split[m] = start + (m-i)*spacing;
        if (j == np) split[np] = 1.0;

        // continue testing beyond the J split

        i = j+1;
      } else i++;
    }

    // sanity check on bad duplicate or inverted splits
    // zero or negative width sub-domains will break Comm class
    // should never happen if recursive multisection algorithm is correct

    int bad = 0;
    for (i = 0; i < np; i++)
      if (split[i] >= split[i+1]) bad = 1;
    if (bad) error->all(FLERR,"Balance produced bad splits");

    // stop at this point in bstr if imbalance factor < threshold
    // this is a true 3d test of particle count per processor

    double imbfactor = imbalance_splits();
    if (imbfactor <= stopthresh) break;
  }

  // restore real coords

  domain->lamda2x(atom->nlocal);

  return niter;
}

/* ----------------------------------------------------------------------
   count atoms in each slice, based on their dim coordinate
   N = # of slices
   split = N+1 cuts between N slices
   return updated count = particles per slice
   return updated sum = cumulative count below each of N+1 splits
   use binary search to find which slice each atom is in
------------------------------------------------------------------------- */

void Balance::tally(int dim, int n, double *split)
{
  for (int i = 0; i < n; i++) onecost[i] = 0.0;

  double **x = atom->x;
  int nlocal = atom->nlocal;
  int index;

  if (wtflag) {
    weight = fixstore->vstore;
    for (int i = 0; i < nlocal; i++) {
      index = binary(x[i][dim],n,split);
      onecost[index] += weight[i];
    }
  } else {
    for (int i = 0; i < nlocal; i++) {
      index = binary(x[i][dim],n,split);
      onecost[index] += 1.0;
    }
  }

  MPI_Allreduce(onecost,allcost,n,MPI_DOUBLE,MPI_SUM,world);

  sum[0] = 0.0;
  for (int i = 1; i < n+1; i++)
    sum[i] = sum[i-1] + allcost[i-1];
}

/* ----------------------------------------------------------------------
   adjust cuts between N slices in a dim via recursive multisectioning method
   split = current N+1 cuts, with 0.0 and 1.0 at end points
   sum = cumulative count up to each split
   target = desired cumulative count up to each split
   lo/hi = split values that bound current split
   update lo/hi to reflect sums at current split values
   overwrite split with new cuts
     guaranteed that splits will remain in ascending order,
     though adjacent values may be identical
   recursive bisectioning zooms in on each cut by halving lo/hi
   return 0 if no changes in any splits, b/c they are all perfect
------------------------------------------------------------------------- */

int Balance::adjust(int n, double *split)
{
  int i;
  double fraction;

  // reset lo/hi based on current sum and splits
  // insure lo is monotonically increasing, ties are OK
  // insure hi is monotonically decreasing, ties are OK
  // this effectively uses info from nearby splits
  // to possibly tighten bounds on lo/hi

  for (i = 1; i < n; i++) {
    if (sum[i] <= target[i]) {
      lo[i] = split[i];
      losum[i] = sum[i];
    }
    if (sum[i] >= target[i]) {
      hi[i] = split[i];
      hisum[i] = sum[i];
    }
  }
  for (i = 1; i < n; i++)
    if (lo[i] < lo[i-1]) {
      lo[i] = lo[i-1];
      losum[i] = losum[i-1];
    }
  for (i = n-1; i > 0; i--)
    if (hi[i] > hi[i+1]) {
      hi[i] = hi[i+1];
      hisum[i] = hisum[i+1];
    }

  int change = 0;
  for (int i = 1; i < n; i++)
    if (sum[i] != target[i]) {
      change = 1;
      if (rho == 0) split[i] = 0.5 * (lo[i]+hi[i]);
      else {
        fraction = 1.0*(target[i]-losum[i]) / (hisum[i]-losum[i]);
        split[i] = lo[i] + fraction * (hi[i]-lo[i]);
      }
    }
  return change;
}

/* ----------------------------------------------------------------------
   calculate imbalance based on processor splits in 3 dims
   atoms must be in lamda coords (0-1) before called
   map particles to 3d grid of procs
   return imbalance factor = max load per proc / ave load per proc
------------------------------------------------------------------------- */

double Balance::imbalance_splits()
{
  double *xsplit = comm->xsplit;
  double *ysplit = comm->ysplit;
  double *zsplit = comm->zsplit;

  int nx = comm->procgrid[0];
  int ny = comm->procgrid[1];
  int nz = comm->procgrid[2];

  for (int i = 0; i < nprocs; i++) proccost[i] = 0.0;

  double **x = atom->x;
  int nlocal = atom->nlocal;
  int ix,iy,iz;

  if (wtflag) {
    weight = fixstore->vstore;
    for (int i = 0; i < nlocal; i++) {
      ix = binary(x[i][0],nx,xsplit);
      iy = binary(x[i][1],ny,ysplit);
      iz = binary(x[i][2],nz,zsplit);
      proccost[iz*nx*ny + iy*nx + ix] += weight[i];
    }
  } else {
    for (int i = 0; i < nlocal; i++) {
      ix = binary(x[i][0],nx,xsplit);
      iy = binary(x[i][1],ny,ysplit);
      iz = binary(x[i][2],nz,zsplit);
      proccost[iz*nx*ny + iy*nx + ix] += 1.0;
    }
  }

  // one proc's particles may map to many partitions, so must Allreduce

  MPI_Allreduce(proccost,allproccost,nprocs,MPI_DOUBLE,MPI_SUM,world);

  double maxcost = 0.0;
  double totalcost = 0.0;
  for (int i = 0; i < nprocs; i++) {
    maxcost = MAX(maxcost,allproccost[i]);
    totalcost += allproccost[i];
  }

  double imbalance = 1.0;
  if (maxcost > 0.0) imbalance = maxcost / (totalcost/nprocs);
  return imbalance;
}

/* ----------------------------------------------------------------------
   binary search for where value falls in N-length vec
   note that vec actually has N+1 values, but ignore last one
   values in vec are monotonically increasing, but adjacent values can be ties
   value may be outside range of vec limits
   always return index from 0 to N-1 inclusive
   return 0 if value < vec[0]
   reutrn N-1 if value >= vec[N-1]
   return index = 1 to N-2 inclusive if vec[index] <= value < vec[index+1]
   note that for adjacent tie values, index of lower tie is not returned
     since never satisfies 2nd condition that value < vec[index+1]
------------------------------------------------------------------------- */

int Balance::binary(double value, int n, double *vec)
{
  int lo = 0;
  int hi = n-1;

  if (value < vec[lo]) return lo;
  if (value >= vec[hi]) return hi;

  // insure vec[lo] <= value < vec[hi] at every iteration
  // done when lo,hi are adjacent

  int index = (lo+hi)/2;
  while (lo < hi-1) {
    if (value < vec[index]) hi = index;
    else if (value >= vec[index]) lo = index;
    index = (lo+hi)/2;
  }

  return index;
}

/* ----------------------------------------------------------------------
   write dump snapshot of line segments in Pizza.py mdump mesh format
   write xy lines around each proc's sub-domain for 2d
   write xyz cubes around each proc's sub-domain for 3d
   only called by proc 0
   NOTE: only implemented for orthogonal boxes, not triclinic
------------------------------------------------------------------------- */

void Balance::dumpout(bigint tstep)
{
  int dimension = domain->dimension;
  int triclinic = domain->triclinic;

  // Allgather each proc's sub-box
  // could use Gather, but that requires MPI to alloc memory

  double *lo,*hi;
  if (triclinic == 0) {
    lo = domain->sublo;
    hi = domain->subhi;
  } else {
    lo = domain->sublo_lamda;
    hi = domain->subhi_lamda;
  }

  double box[6];
  box[0] = lo[0]; box[1] = lo[1]; box[2] = lo[2];
  box[3] = hi[0]; box[4] = hi[1]; box[5] = hi[2];

  double **boxall;
  memory->create(boxall,nprocs,6,"balance:dumpout");
  MPI_Allgather(box,6,MPI_DOUBLE,&boxall[0][0],6,MPI_DOUBLE,world);

  if (me) {
    memory->destroy(boxall);
    return;
  }

  // proc 0 writes out nodal coords
  // some will be duplicates

  double *boxlo = domain->boxlo;
  double *boxhi = domain->boxhi;

  fmt::print(fp,"ITEM: TIMESTEP\n{}\n",tstep);
  fprintf(fp,"ITEM: NUMBER OF NODES\n");
  if (dimension == 2) fprintf(fp,"%d\n",4*nprocs);
  else fprintf(fp,"%d\n",8*nprocs);
  fprintf(fp,"ITEM: BOX BOUNDS\n");
  fprintf(fp,"%g %g\n",boxlo[0],boxhi[0]);
  fprintf(fp,"%g %g\n",boxlo[1],boxhi[1]);
  fprintf(fp,"%g %g\n",boxlo[2],boxhi[2]);
  fprintf(fp,"ITEM: NODES\n");

  if (triclinic == 0) {
    if (dimension == 2) {
      int m = 0;
      for (int i = 0; i < nprocs; i++) {
        fprintf(fp,"%d %d %g %g %g\n",m+1,1,boxall[i][0],boxall[i][1],0.0);
        fprintf(fp,"%d %d %g %g %g\n",m+2,1,boxall[i][3],boxall[i][1],0.0);
        fprintf(fp,"%d %d %g %g %g\n",m+3,1,boxall[i][3],boxall[i][4],0.0);
        fprintf(fp,"%d %d %g %g %g\n",m+4,1,boxall[i][0],boxall[i][4],0.0);
        m += 4;
      }
    } else {
      int m = 0;
      for (int i = 0; i < nprocs; i++) {
        fprintf(fp,"%d %d %g %g %g\n",m+1,1,
                boxall[i][0],boxall[i][1],boxall[i][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+2,1,
                boxall[i][3],boxall[i][1],boxall[i][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+3,1,
                boxall[i][3],boxall[i][4],boxall[i][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+4,1,
                boxall[i][0],boxall[i][4],boxall[i][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+5,1,
                boxall[i][0],boxall[i][1],boxall[i][5]);
        fprintf(fp,"%d %d %g %g %g\n",m+6,1,
                boxall[i][3],boxall[i][1],boxall[i][5]);
        fprintf(fp,"%d %d %g %g %g\n",m+7,1,
                boxall[i][3],boxall[i][4],boxall[i][5]);
        fprintf(fp,"%d %d %g %g %g\n",m+8,1,
                boxall[i][0],boxall[i][4],boxall[i][5]);
        m += 8;
      }
    }

  } else {
    double (*bc)[3] = domain->corners;

    if (dimension == 2) {
      int m = 0;
      for (int i = 0; i < nprocs; i++) {
        domain->lamda_box_corners(&boxall[i][0],&boxall[i][3]);
        fprintf(fp,"%d %d %g %g %g\n",m+1,1,bc[0][0],bc[0][1],0.0);
        fprintf(fp,"%d %d %g %g %g\n",m+2,1,bc[1][0],bc[1][1],0.0);
        fprintf(fp,"%d %d %g %g %g\n",m+3,1,bc[2][0],bc[2][1],0.0);
        fprintf(fp,"%d %d %g %g %g\n",m+4,1,bc[3][0],bc[3][1],0.0);
        m += 4;
      }
    } else {
      int m = 0;
      for (int i = 0; i < nprocs; i++) {
        domain->lamda_box_corners(&boxall[i][0],&boxall[i][3]);
        fprintf(fp,"%d %d %g %g %g\n",m+1,1,bc[0][0],bc[0][1],bc[0][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+2,1,bc[1][0],bc[1][1],bc[1][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+3,1,bc[2][0],bc[2][1],bc[2][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+4,1,bc[3][0],bc[3][1],bc[3][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+5,1,bc[4][0],bc[4][1],bc[4][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+6,1,bc[5][0],bc[5][1],bc[5][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+7,1,bc[6][0],bc[6][1],bc[6][2]);
        fprintf(fp,"%d %d %g %g %g\n",m+8,1,bc[7][0],bc[7][1],bc[7][2]);
        m += 8;
      }
    }
  }

  // write out one square/cube per processor for 2d/3d

  fmt::print(fp,"ITEM: TIMESTEP\n{}\n",tstep);
  if (dimension == 2) fprintf(fp,"ITEM: NUMBER OF SQUARES\n");
  else fprintf(fp,"ITEM: NUMBER OF CUBES\n");
  fprintf(fp,"%d\n",nprocs);
  if (dimension == 2) fprintf(fp,"ITEM: SQUARES\n");
  else fprintf(fp,"ITEM: CUBES\n");

  if (dimension == 2) {
    int m = 0;
    for (int i = 0; i < nprocs; i++) {
      fprintf(fp,"%d %d %d %d %d %d\n",i+1,1,m+1,m+2,m+3,m+4);
      m += 4;
    }
  } else {
    int m = 0;
    for (int i = 0; i < nprocs; i++) {
      fprintf(fp,"%d %d %d %d %d %d %d %d %d %d\n",
              i+1,1,m+1,m+2,m+3,m+4,m+5,m+6,m+7,m+8);
      m += 8;
    }
  }

  memory->destroy(boxall);
}

/* ----------------------------------------------------------------------
   debug output for Idim and count
   only called by proc 0
------------------------------------------------------------------------- */

#ifdef BALANCE_DEBUG
void Balance::debug_shift_output(int idim, int m, int np, double *split)
{
  int i;
  const char *dim = nullptr;

  double *boxlo = domain->boxlo;
  double *prd = domain->prd;

  if (bdim[idim] == X) dim = "X";
  else if (bdim[idim] == Y) dim = "Y";
  else if (bdim[idim] == Z) dim = "Z";
  fprintf(stderr,"Dimension %s, Iteration %d\n",dim,m);

  fprintf(stderr,"  Count:");
  for (i = 0; i <= np; i++) fmt::print(stderr," {}",count[i]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Sum:");
  for (i = 0; i <= np; i++) fmt::print(stderr," {}",sum[i]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Target:");
  for (i = 0; i <= np; i++) fmt::print(stderr," {}",target[i]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Actual cut:");
  for (i = 0; i <= np; i++)
    fprintf(stderr," %g",boxlo[bdim[idim]] + split[i]*prd[bdim[idim]]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Split:");
  for (i = 0; i <= np; i++) fprintf(stderr," %g",split[i]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Low:");
  for (i = 0; i <= np; i++) fprintf(stderr," %g",lo[i]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Low-sum:");
  for (i = 0; i <= np; i++) fmt::print(stderr," {}",losum[i]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Hi:");
  for (i = 0; i <= np; i++) fprintf(stderr," %g",hi[i]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Hi-sum:");
  for (i = 0; i <= np; i++) fmt::print(stderr," {}",hisum[i]);
  fprintf(stderr,"\n");
  fprintf(stderr,"  Delta:");
  for (i = 0; i < np; i++) fprintf(stderr," %g",split[i+1]-split[i]);
  fprintf(stderr,"\n");
}
#endif