diff --git a/src/balance.cpp b/src/balance.cpp new file mode 100644 index 0000000000..458b6df23c --- /dev/null +++ b/src/balance.cpp @@ -0,0 +1,728 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "math.h" +#include "stdlib.h" +#include "string.h" +#include "balance.h" +#include "atom.h" +#include "comm.h" +#include "irregular.h" +#include "domain.h" +#include "force.h" +#include "update.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +enum{NONE,UNIFORM,USER,DYNAMIC}; +enum{X,Y,Z}; +enum{EXPAND,CONTRACT}; + +/* ---------------------------------------------------------------------- */ + +Balance::Balance(LAMMPS *lmp) : Pointers(lmp) +{ + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + + memory->create(pcount,nprocs,"balance:pcount"); + memory->create(allcount,nprocs,"balance:allcount"); + + user_xsplit = user_ysplit = user_zsplit = NULL; + dflag = 0; + + fp = NULL; +} + +/* ---------------------------------------------------------------------- */ + +Balance::~Balance() +{ + memory->destroy(pcount); + memory->destroy(allcount); + + delete [] user_xsplit; + delete [] user_ysplit; + delete [] user_zsplit; + + if (dflag) { + delete [] bstr; + delete [] ops; + delete [] counts[0]; + delete [] counts[1]; + delete [] counts[2]; + delete [] cuts; + delete [] onecount; + //MPI_Comm_free(&commslice[0]); + //MPI_Comm_free(&commslice[1]); + //MPI_Comm_free(&commslice[2]); + } + + if (fp) fclose(fp); +} + +/* ---------------------------------------------------------------------- + called as balance command in input script +------------------------------------------------------------------------- */ + +void Balance::command(int narg, char **arg) +{ + if (domain->box_exist == 0) + error->all(FLERR,"Balance command before simulation box is defined"); + + if (comm->me == 0 && screen) fprintf(screen,"Balancing ...\n"); + + // parse arguments + + int dimension = domain->dimension; + int *procgrid = comm->procgrid; + xflag = yflag = zflag = NONE; + dflag = 0; + + int iarg = 0; + while (iarg < narg) { + if (strcmp(arg[iarg],"x") == 0) { + if (xflag == DYNAMIC) error->all(FLERR,"Illegal balance command"); + if (strcmp(arg[iarg+1],"uniform") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal balance command"); + xflag = UNIFORM; + iarg += 2; + } else { + if (1 + procgrid[0]-1 > narg) + error->all(FLERR,"Illegal balance command"); + xflag = USER; + delete [] user_xsplit; + user_xsplit = new double[procgrid[0]+1]; + user_xsplit[0] = 0.0; + iarg++; + for (int i = 1; i < procgrid[0]; i++) + user_xsplit[i] = force->numeric(arg[iarg++]); + user_xsplit[procgrid[0]] = 1.0; + } + } else if (strcmp(arg[iarg],"y") == 0) { + if (yflag == DYNAMIC) error->all(FLERR,"Illegal balance command"); + if (strcmp(arg[iarg+1],"uniform") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal balance command"); + yflag = UNIFORM; + iarg += 2; + } else { + if (1 + procgrid[1]-1 > narg) + error->all(FLERR,"Illegal balance command"); + yflag = USER; + delete [] user_ysplit; + user_ysplit = new double[procgrid[1]+1]; + user_ysplit[0] = 0.0; + iarg++; + for (int i = 1; i < procgrid[1]; i++) + user_ysplit[i] = force->numeric(arg[iarg++]); + user_ysplit[procgrid[1]] = 1.0; + } + } else if (strcmp(arg[iarg],"z") == 0) { + if (zflag == DYNAMIC) error->all(FLERR,"Illegal balance command"); + if (strcmp(arg[iarg+1],"uniform") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal balance command"); + zflag = UNIFORM; + iarg += 2; + } else { + if (1 + procgrid[2]-1 > narg) + error->all(FLERR,"Illegal balance command"); + zflag = USER; + delete [] user_zsplit; + user_zsplit = new double[procgrid[2]+1]; + user_zsplit[0] = 0.0; + iarg++; + for (int i = 1; i < procgrid[2]; i++) + user_zsplit[i] = force->numeric(arg[iarg++]); + user_zsplit[procgrid[2]] = 1.0; + } + + } else if (strcmp(arg[iarg],"dynamic") == 0) { + if (xflag != NONE || yflag != NONE || zflag != NONE) + error->all(FLERR,"Illegal balance command"); + if (iarg+5 > narg) error->all(FLERR,"Illegal balance command"); + dflag = 1; + xflag = yflag = DYNAMIC; + if (dimension == 3) zflag = DYNAMIC; + nrepeat = atoi(arg[iarg+1]); + niter = atoi(arg[iarg+2]); + if (nrepeat <= 0 || niter <= 0) + error->all(FLERR,"Illegal balance command"); + int n = strlen(arg[iarg+3]) + 1; + bstr = new char[n]; + strcpy(bstr,arg[iarg+3]); + thresh = atof(arg[iarg+4]); + if (thresh < 1.0) error->all(FLERR,"Illegal balance command"); + iarg += 5; + + } else error->all(FLERR,"Illegal balance command"); + } + + // error check + + if (zflag && dimension == 2) + error->all(FLERR,"Cannot balance in z dimension for 2d simulation"); + + if (xflag == USER) + for (int i = 1; i <= procgrid[0]; i++) + if (user_xsplit[i-1] >= user_xsplit[i]) + error->all(FLERR,"Illegal balance command"); + if (yflag == USER) + for (int i = 1; i <= procgrid[1]; i++) + if (user_ysplit[i-1] >= user_ysplit[i]) + error->all(FLERR,"Illegal balance command"); + if (zflag == USER) + for (int i = 1; i <= procgrid[2]; i++) + if (user_zsplit[i-1] >= user_zsplit[i]) + error->all(FLERR,"Illegal balance command"); + + if (dflag) { + for (int i = 0; i < strlen(bstr); i++) { + if (bstr[i] != 'x' && bstr[i] != 'y' && bstr[i] != 'z') + error->all(FLERR,"Balance dynamic string is invalid"); + if (bstr[i] == 'z' && dimension == 2) + error->all(FLERR,"Balance dynamic string is invalid for 2d simulation"); + } + } + + // insure atoms are in current box & update box via shrink-wrap + // no exchange() since doesn't matter if atoms are assigned to correct procs + + if (domain->triclinic) domain->x2lamda(atom->nlocal); + domain->pbc(); + domain->reset_box(); + if (domain->triclinic) domain->lamda2x(atom->nlocal); + + // imbinit = initial imbalance + // use current splits instead of nlocal since atoms may not be in sub-box + + domain->x2lamda(atom->nlocal); + int maxinit; + double imbinit = imbalance_splits(maxinit); + domain->lamda2x(atom->nlocal); + + // debug output of initial state + + //dumpout(update->ntimestep); + + // explicit setting of sub-domain sizes + + if (xflag == UNIFORM) { + for (int i = 0; i < procgrid[0]; i++) + comm->xsplit[i] = i * 1.0/procgrid[0]; + comm->xsplit[procgrid[0]] = 1.0; + } + + if (yflag == UNIFORM) { + for (int i = 0; i < procgrid[1]; i++) + comm->ysplit[i] = i * 1.0/procgrid[1]; + comm->ysplit[procgrid[1]] = 1.0; + } + + if (zflag == UNIFORM) { + for (int i = 0; i < procgrid[2]; i++) + comm->zsplit[i] = i * 1.0/procgrid[2]; + comm->zsplit[procgrid[2]] = 1.0; + } + + if (xflag == USER) + for (int i = 0; i <= procgrid[0]; i++) comm->xsplit[i] = user_xsplit[i]; + + if (yflag == USER) + for (int i = 0; i <= procgrid[1]; i++) comm->ysplit[i] = user_ysplit[i]; + + if (zflag == USER) + for (int i = 0; i <= procgrid[2]; i++) comm->zsplit[i] = user_zsplit[i]; + + // dynamic load-balance of sub-domain sizes + + if (dflag) { + dynamic_setup(bstr); + dynamic(); + } else count = 0; + + // debug output of final result + + //dumpout(-1); + + // reset comm->uniform flag if necessary + + if (comm->uniform) { + if (xflag == USER || xflag == DYNAMIC) comm->uniform = 0; + if (yflag == USER || yflag == DYNAMIC) comm->uniform = 0; + if (zflag == USER || zflag == DYNAMIC) comm->uniform = 0; + } else { + if (dimension == 3) { + if (xflag == UNIFORM && yflag == UNIFORM && zflag == UNIFORM) + comm->uniform = 1; + } else { + if (xflag == UNIFORM && yflag == UNIFORM) comm->uniform = 1; + } + } + + // reset proc sub-domains + + if (domain->triclinic) domain->set_lamda_box(); + domain->set_local_box(); + + // move atoms to new processors via irregular() + + if (domain->triclinic) domain->x2lamda(atom->nlocal); + Irregular *irregular = new Irregular(lmp); + irregular->migrate_atoms(); + delete irregular; + if (domain->triclinic) domain->lamda2x(atom->nlocal); + + // check if any atoms were lost + + bigint natoms; + bigint nblocal = atom->nlocal; + MPI_Allreduce(&nblocal,&natoms,1,MPI_LMP_BIGINT,MPI_SUM,world); + if (natoms != atom->natoms) { + char str[128]; + sprintf(str,"Lost atoms via balance: original " BIGINT_FORMAT + " current " BIGINT_FORMAT,atom->natoms,natoms); + error->all(FLERR,str); + } + + // imbfinal = final imbalance based on final nlocal + + int maxfinal; + double imbfinal = imbalance_nlocal(maxfinal); + + if (me == 0) { + if (screen) { + fprintf(screen," iteration count = %d\n",count); + fprintf(screen," initial/final max atoms/proc = %d %d\n", + maxinit,maxfinal); + fprintf(screen," initial/final imbalance factor = %g %g\n", + imbinit,imbfinal); + } + if (logfile) { + fprintf(logfile," iteration count = %d\n",count); + fprintf(logfile," initial/final max atoms/proc = %d %d\n", + maxinit,maxfinal); + fprintf(logfile," initial/final imbalance factor = %g %g\n", + imbinit,imbfinal); + } + } + + if (me == 0) { + if (screen) { + fprintf(screen," x cuts:"); + for (int i = 0; i <= comm->procgrid[0]; i++) + fprintf(screen," %g",comm->xsplit[i]); + fprintf(screen,"\n"); + fprintf(screen," y cuts:"); + for (int i = 0; i <= comm->procgrid[1]; i++) + fprintf(screen," %g",comm->ysplit[i]); + fprintf(screen,"\n"); + fprintf(screen," z cuts:"); + for (int i = 0; i <= comm->procgrid[2]; i++) + fprintf(screen," %g",comm->zsplit[i]); + fprintf(screen,"\n"); + } + if (logfile) { + fprintf(logfile," x cuts:"); + for (int i = 0; i <= comm->procgrid[0]; i++) + fprintf(logfile," %g",comm->xsplit[i]); + fprintf(logfile,"\n"); + fprintf(logfile," y cuts:"); + for (int i = 0; i <= comm->procgrid[1]; i++) + fprintf(logfile," %g",comm->ysplit[i]); + fprintf(logfile,"\n"); + fprintf(logfile," z cuts:"); + for (int i = 0; i <= comm->procgrid[2]; i++) + fprintf(logfile," %g",comm->zsplit[i]); + fprintf(logfile,"\n"); + } + } +} + +/* ---------------------------------------------------------------------- + calculate imbalance based on nlocal + return max = max atom per proc + return imbalance factor = max atom per proc / ave atom per proc +------------------------------------------------------------------------- */ + +double Balance::imbalance_nlocal(int &max) +{ + MPI_Allreduce(&atom->nlocal,&max,1,MPI_INT,MPI_MAX,world); + double imbalance = max / (1.0 * atom->natoms / nprocs); + return imbalance; +} + +/* ---------------------------------------------------------------------- + calculate imbalance based on processor splits in 3 dims + atoms must be in lamda coords (0-1) before called + map atoms to 3d grid of procs + return max = max atom per proc + return imbalance factor = max atom per proc / ave atom per proc +------------------------------------------------------------------------- */ + +double Balance::imbalance_splits(int &max) +{ + double *xsplit = comm->xsplit; + double *ysplit = comm->ysplit; + double *zsplit = comm->zsplit; + + int nx = comm->procgrid[0]; + int ny = comm->procgrid[1]; + int nz = comm->procgrid[2]; + + for (int i = 0; i < nprocs; i++) pcount[i] = 0; + + double **x = atom->x; + int nlocal = atom->nlocal; + int ix,iy,iz; + + for (int i = 0; i < nlocal; i++) { + ix = binary(x[i][0],nx,xsplit); + iy = binary(x[i][1],ny,ysplit); + iz = binary(x[i][2],nz,zsplit); + pcount[iz*nx*ny + iy*nx + ix]++; + } + + MPI_Allreduce(pcount,allcount,nprocs,MPI_INT,MPI_SUM,world); + max = 0; + for (int i = 0; i < nprocs; i++) max = MAX(max,allcount[i]); + double imbalance = max / (1.0 * atom->natoms / nprocs); + return imbalance; +} + +/* ---------------------------------------------------------------------- + setup dynamic load balance operations + called from command & fix balance +------------------------------------------------------------------------- */ + +void Balance::dynamic_setup(char *str) +{ + nops = strlen(str); + ops = new int[nops]; + + for (int i = 0; i < strlen(str); i++) { + if (str[i] == 'x') ops[i] = X; + if (str[i] == 'y') ops[i] = Y; + if (str[i] == 'z') ops[i] = Z; + } + + splits[0] = comm->xsplit; + splits[1] = comm->ysplit; + splits[2] = comm->zsplit; + + counts[0] = new bigint[comm->procgrid[0]]; + counts[1] = new bigint[comm->procgrid[1]]; + counts[2] = new bigint[comm->procgrid[2]]; + + int max = MAX(comm->procgrid[0],comm->procgrid[1]); + max = MAX(max,comm->procgrid[2]); + cuts = new double[max+1]; + onecount = new bigint[max]; + + //MPI_Comm_split(world,comm->myloc[0],0,&commslice[0]); + //MPI_Comm_split(world,comm->myloc[1],0,&commslice[1]); + //MPI_Comm_split(world,comm->myloc[2],0,&commslice[2]); +} + +/* ---------------------------------------------------------------------- + perform dynamic load balance by changing xyz split proc boundaries in Comm + called from command and fix balance +------------------------------------------------------------------------- */ + +void Balance::dynamic() +{ + int i,m,max; + double imbfactor; + + int *procgrid = comm->procgrid; + + domain->x2lamda(atom->nlocal); + + count = 0; + for (int irepeat = 0; irepeat < nrepeat; irepeat++) { + for (i = 0; i < nops; i++) { + for (m = 0; m < niter; m++) { + stats(ops[i],procgrid[ops[i]],splits[ops[i]],counts[ops[i]]); + adjust(procgrid[ops[i]],counts[ops[i]],splits[ops[i]]); + count++; + // debug output of intermediate result + //dumpout(-1); + } + imbfactor = imbalance_splits(max); + if (imbfactor <= thresh) break; + } + if (i < nops) break; + } + + domain->lamda2x(atom->nlocal); +} + +/* ---------------------------------------------------------------------- + count atoms in each slice + current cuts may be very different than original cuts, + so use binary search to find which slice each atom is in +------------------------------------------------------------------------- */ + +void Balance::stats(int dim, int n, double *split, bigint *count) +{ + for (int i = 0; i < n; i++) onecount[i] = 0; + + double **x = atom->x; + int nlocal = atom->nlocal; + int index; + + for (int i = 0; i < nlocal; i++) { + index = binary(x[i][dim],n,split); + onecount[index]++; + } + + MPI_Allreduce(onecount,count,n,MPI_LMP_BIGINT,MPI_SUM,world); +} + +/* ---------------------------------------------------------------------- + adjust cuts between N slices in a dim via diffusive method + count = atoms per slice + split = current N+1 cuts, with 0.0 and 1.0 at end points + overwrite split with new cuts + diffusion means slices with more atoms than their neighbors + "send" them atoms by moving cut closer to sender, further from receiver +------------------------------------------------------------------------- */ + +void Balance::adjust(int n, bigint *count, double *split) +{ + // damping factor + + double damp = 0.5; + + // loop over slices from 1 to N-2 inclusive (not end slices 0 and N-1) + // cut I is between 2 slices (I-1 and I) with counts + // cut I+1 is between 2 slices (I and I+1) with counts + // for a cut between 2 slices, only slice with larger count adjusts it + + bigint leftcount,mycount,rightcount; + double rho,target,targetleft,targetright; + + for (int i = 1; i < n-1; i++) { + leftcount = count[i-1]; + mycount = count[i]; + rightcount = count[i+1]; + + // middle slice is <= both left and right, so do nothing + // special case if 2 slices both have count = 0, no change in cut + + if (mycount <= leftcount && mycount <= rightcount) { + if (leftcount == 0) cuts[i] = split[i]; + if (rightcount == 0) cuts[i+1] = split[i+1]; + continue; + } + + // rho = density of atoms in the slice + + rho = mycount / (split[i+1] - split[i]); + + // middle slice has more atoms then left or right slice + // if delta from middle to top slice > delta between top and bottom slice + // then send atoms both dirs to bring all 3 slices to same count + // else bottom slice is very low, so send atoms only in that dir + + if (mycount > leftcount && mycount > rightcount) { + if (mycount-MAX(leftcount,rightcount) >= fabs(leftcount-rightcount)) { + if (leftcount <= rightcount) { + targetleft = damp * + (rightcount-leftcount + (mycount-rightcount)/3.0); + targetright = damp * (mycount-rightcount)/3.0; + cuts[i] = split[i] + targetleft/rho; + cuts[i+1] = split[i+1] - targetright/rho; + } else { + targetleft = damp * (mycount-leftcount)/3.0; + targetright = damp * + (leftcount-rightcount + (mycount-leftcount)/3.0); + cuts[i] = split[i] + targetleft/rho; + cuts[i+1] = split[i+1] - targetright/rho; + } + } else if (leftcount < rightcount) { + target = damp * 0.5*(mycount-leftcount); + cuts[i] = split[i] + target/rho; + cuts[i+1] = split[i+1]; + } else if (rightcount < leftcount) { + target = damp * 0.5*(mycount-rightcount); + cuts[i+1] = split[i+1] - target/rho; + cuts[i] = split[i]; + } + + // middle slice has more atoms than only left or right slice + // send atoms only in that dir + + } else if (mycount > leftcount) { + target = damp * 0.5*(mycount-leftcount); + cuts[i] = split[i] + target/rho; + } else if (mycount > rightcount) { + target = damp * 0.5*(mycount-rightcount); + cuts[i+1] = split[i+1] - target/rho; + } + } + + // overwrite adjustable splits with new cuts + + for (int i = 1; i < n; i++) split[i] = cuts[i]; +} + +/* ---------------------------------------------------------------------- + binary search for value in N-length ascending vec + value may be outside range of vec limits + always return index from 0 to N-1 inclusive + return 0 if value < vec[0] + reutrn N-1 if value >= vec[N-1] + return index = 1 to N-2 if vec[index] <= value < vec[index+1] +------------------------------------------------------------------------- */ + +int Balance::binary(double value, int n, double *vec) +{ + int lo = 0; + int hi = n-1; + + if (value < vec[lo]) return lo; + if (value >= vec[hi]) return hi; + + // insure vec[lo] <= value < vec[hi] at every iteration + // done when lo,hi are adjacent + + int index = (lo+hi)/2; + while (lo < hi-1) { + if (value < vec[index]) hi = index; + else if (value >= vec[index]) lo = index; + index = (lo+hi)/2; + } + + return index; +} + +/* ---------------------------------------------------------------------- + create dump file of line segments in Pizza.py mdump mesh format + just for debug/viz purposes + write to tmp.mdump.*, open first time if necessary + write xy lines around each proc's sub-domain for 2d + write xyz cubes around each proc's sub-domain for 3d + all procs but 0 just return +------------------------------------------------------------------------- */ + +void Balance::dumpout(bigint tstamp) +{ + if (me) return; + + bigint tstep; + if (tstamp >= 0) tstep = tstamp; + else tstep = laststep + 1; + laststep = tstep; + + if (fp == NULL) { + char file[32]; + sprintf(file,"tmp.mdump.%ld",tstep); + fp = fopen(file,"w"); + if (!fp) error->one(FLERR,"Cannot open balance output file"); + + // write out one square/cute per processor for 2d/3d + // only write once since topology is static + + fprintf(fp,"ITEM: TIMESTEP\n"); + fprintf(fp,"%ld\n",tstep); + fprintf(fp,"ITEM: NUMBER OF SQUARES\n"); + fprintf(fp,"%d\n",nprocs); + fprintf(fp,"ITEM: SQUARES\n"); + + int nx = comm->procgrid[0] + 1; + int ny = comm->procgrid[1] + 1; + int nz = comm->procgrid[2] + 1; + + if (domain->dimension == 2) { + int m = 0; + for (int j = 0; j < comm->procgrid[1]; j++) + for (int i = 0; i < comm->procgrid[0]; i++) { + int c1 = j*nx + i + 1; + int c2 = c1 + 1; + int c3 = c2 + nx; + int c4 = c3 - 1; + fprintf(fp,"%d %d %d %d %d %d\n",m+1,m+1,c1,c2,c3,c4); + m++; + } + + } else { + int m = 0; + for (int k = 0; k < comm->procgrid[2]; k++) + for (int j = 0; j < comm->procgrid[1]; j++) + for (int i = 0; i < comm->procgrid[0]; i++) { + int c1 = k*ny*nx + j*nx + i + 1; + int c2 = c1 + 1; + int c3 = c2 + nx; + int c4 = c3 - 1; + int c5 = c1 + ny*nx; + int c6 = c2 + ny*nx; + int c7 = c3 + ny*nx; + int c8 = c4 + ny*nx; + fprintf(fp,"%d %d %d %d %d %d %d %d %d %d\n", + m+1,m+1,c1,c2,c3,c4,c5,c6,c7,c8); + m++; + } + } + } + + // write out nodal coords, can be different every call + // scale xsplit,ysplit,zsplit values to full box + // only implmented for orthogonal boxes, not triclinic + + int nx = comm->procgrid[0] + 1; + int ny = comm->procgrid[1] + 1; + int nz = comm->procgrid[2] + 1; + + double *boxlo = domain->boxlo; + double *boxhi = domain->boxhi; + double *prd = domain->prd; + + fprintf(fp,"ITEM: TIMESTEP\n"); + fprintf(fp,"%ld\n",tstep); + fprintf(fp,"ITEM: NUMBER OF NODES\n"); + if (domain->dimension == 2) + fprintf(fp,"%d\n",nx*ny); + else + fprintf(fp,"%d\n",nx*ny*nz); + fprintf(fp,"ITEM: BOX BOUNDS\n"); + fprintf(fp,"%g %g\n",boxlo[0],boxhi[0]); + fprintf(fp,"%g %g\n",boxlo[1],boxhi[1]); + fprintf(fp,"%g %g\n",boxlo[2],boxhi[2]); + fprintf(fp,"ITEM: NODES\n"); + + if (domain->dimension == 2) { + int m = 0; + for (int j = 0; j < ny; j++) + for (int i = 0; i < nx; i++) { + fprintf(fp,"%d %d %g %g %g\n",m+1,1, + boxlo[0] + prd[0]*comm->xsplit[i], + boxlo[1] + prd[1]*comm->ysplit[j], + 0.0); + m++; + } + } else { + int m = 0; + for (int k = 0; k < nz; k++) + for (int j = 0; j < ny; j++) + for (int i = 0; i < nx; i++) { + fprintf(fp,"%d %d %g %g %g\n",m+1,1, + boxlo[0] + prd[0]*comm->xsplit[i], + boxlo[1] + prd[1]*comm->ysplit[j], + boxlo[2] + prd[2]*comm->zsplit[j]); + m++; + } + } +} diff --git a/src/balance.h b/src/balance.h new file mode 100644 index 0000000000..7474fd349f --- /dev/null +++ b/src/balance.h @@ -0,0 +1,71 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef COMMAND_CLASS + +CommandStyle(balance,Balance) + +#else + +#ifndef LMP_BALANCE_H +#define LMP_BALANCE_H + +#include "mpi.h" +#include "stdio.h" +#include "pointers.h" + +namespace LAMMPS_NS { + +class Balance : protected Pointers { + public: + Balance(class LAMMPS *); + ~Balance(); + void command(int, char **); + + private: + int me,nprocs; + int xflag,yflag,zflag,dflag; + int nrepeat,niter; + double thresh; + char *bstr; + double *user_xsplit,*user_ysplit,*user_zsplit; + + int *ops; + int nops; + double *splits[3]; + bigint *counts[3]; + double *cuts; + bigint *onecount; + MPI_Comm commslice[3]; + + int count; + int *pcount,*allcount; + + FILE *fp; // for debug output + bigint laststep; + + double imbalance_splits(int &); + double imbalance_nlocal(int &); + void dynamic_setup(char *); + void dynamic(); + void stats(int, int, double *, bigint *); + void adjust(int, bigint *, double *); + int binary(double, int, double *); + + void dumpout(bigint); // for debug output +}; + +} + +#endif +#endif diff --git a/src/change_box.cpp b/src/change_box.cpp index da223074f1..a47be36665 100644 --- a/src/change_box.cpp +++ b/src/change_box.cpp @@ -150,13 +150,13 @@ void ChangeBox::command(int narg, char **arg) iarg += 4; } else if (strcmp(arg[iarg],"ortho") == 0) { - if (iarg+2 > narg) error->all(FLERR,"Illegal change_box command"); + if (iarg+1 > narg) error->all(FLERR,"Illegal change_box command"); ops[nops].style = ORTHO; nops++; iarg += 1; } else if (strcmp(arg[iarg],"triclinic") == 0) { - if (iarg+2 > narg) error->all(FLERR,"Illegal change_box command"); + if (iarg+1 > narg) error->all(FLERR,"Illegal change_box command"); ops[nops].style = TRICLINIC; nops++; iarg += 1; diff --git a/src/comm.cpp b/src/comm.cpp index 34ccb9673c..49ba0cad71 100644 --- a/src/comm.cpp +++ b/src/comm.cpp @@ -78,6 +78,8 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp) bordergroup = 0; style = SINGLE; + uniform = 1; + xsplit = ysplit = zsplit = NULL; multilo = multihi = NULL; cutghostmulti = NULL; cutghostuser = 0.0; @@ -124,6 +126,10 @@ Comm::Comm(LAMMPS *lmp) : Pointers(lmp) Comm::~Comm() { + memory->destroy(xsplit); + memory->destroy(ysplit); + memory->destroy(zsplit); + delete [] customfile; delete [] outfile; @@ -145,7 +151,7 @@ Comm::~Comm() /* ---------------------------------------------------------------------- create 3d grid of procs based on Nprocs and box size & shape - map processors to grid + map processors to grid, setup xyz split for a uniform grid ------------------------------------------------------------------------- */ void Comm::set_proc_grid() @@ -252,14 +258,31 @@ void Comm::set_proc_grid() if (outfile) pmap->output(outfile,procgrid,grid2proc); - // set lamda box params after procs are assigned - - if (domain->triclinic) domain->set_lamda_box(); - // free ProcMap class delete pmap; + // set xsplit,ysplit,zsplit for uniform spacings + + memory->destroy(xsplit); + memory->destroy(ysplit); + memory->destroy(zsplit); + + memory->create(xsplit,procgrid[0]+1,"comm:xsplit"); + memory->create(ysplit,procgrid[1]+1,"comm:ysplit"); + memory->create(zsplit,procgrid[2]+1,"comm:zsplit"); + + for (int i = 0; i < procgrid[0]; i++) xsplit[i] = i * 1.0/procgrid[0]; + for (int i = 0; i < procgrid[1]; i++) ysplit[i] = i * 1.0/procgrid[1]; + for (int i = 0; i < procgrid[2]; i++) zsplit[i] = i * 1.0/procgrid[2]; + + xsplit[procgrid[0]] = ysplit[procgrid[1]] = zsplit[procgrid[2]] = 1.0; + + // set lamda box params after procs are assigned + // only set once unless load-balancing occurs + + if (domain->triclinic) domain->set_lamda_box(); + // send my 3d proc grid to another partition if requested if (send_to_partition >= 0) { @@ -399,25 +422,112 @@ void Comm::setup() } } - // need = # of procs I need atoms from in each dim based on max cutoff - // for 2d, don't communicate in z - - need[0] = static_cast (cutghost[0] * procgrid[0] / prd[0]) + 1; - need[1] = static_cast (cutghost[1] * procgrid[1] / prd[1]) + 1; - need[2] = static_cast (cutghost[2] * procgrid[2] / prd[2]) + 1; - if (domain->dimension == 2) need[2] = 0; - - // if non-periodic, do not communicate further than procgrid-1 away - // this enables very large cutoffs in non-periodic systems + // recvneed[idim][0/1] = # of procs away I recv atoms from, within cutghost + // 0 = from left, 1 = from right + // do not cross non-periodic boundaries, need[2] = 0 for 2d + // sendneed[idim][0/1] = # of procs away I send atoms to + // 0 = to left, 1 = to right + // set equal to recvneed[idim][1/0] of neighbor proc + // maxneed[idim] = max procs away any proc recvs atoms in either direction + // uniform = 1 = uniform sized sub-domains: + // maxneed is directly computable from sub-domain size + // limit to procgrid-1 for non-PBC + // recvneed = maxneed except for procs near non-PBC + // sendneed = recvneed of neighbor on each side + // uniform = 0 = non-uniform sized sub-domains: + // compute recvneed via updown() which accounts for non-PBC + // sendneed = recvneed of neighbor on each side + // maxneed via Allreduce() of recvneed int *periodicity = domain->periodicity; - if (periodicity[0] == 0) need[0] = MIN(need[0],procgrid[0]-1); - if (periodicity[1] == 0) need[1] = MIN(need[1],procgrid[1]-1); - if (periodicity[2] == 0) need[2] = MIN(need[2],procgrid[2]-1); + int left,right; + + if (uniform) { + maxneed[0] = static_cast (cutghost[0] * procgrid[0] / prd[0]) + 1; + maxneed[1] = static_cast (cutghost[1] * procgrid[1] / prd[1]) + 1; + maxneed[2] = static_cast (cutghost[2] * procgrid[2] / prd[2]) + 1; + if (domain->dimension == 2) maxneed[2] = 0; + if (!periodicity[0]) maxneed[0] = MIN(maxneed[0],procgrid[0]-1); + if (!periodicity[1]) maxneed[1] = MIN(maxneed[1],procgrid[1]-1); + if (!periodicity[2]) maxneed[2] = MIN(maxneed[2],procgrid[2]-1); + + if (!periodicity[0]) { + recvneed[0][0] = MIN(maxneed[0],myloc[0]); + recvneed[0][1] = MIN(maxneed[0],procgrid[0]-myloc[0]-1); + left = myloc[0] - 1; + if (left < 0) left = procgrid[0] - 1; + sendneed[0][0] = MIN(maxneed[0],procgrid[0]-left-1); + right = myloc[0] + 1; + if (right == procgrid[0]) right = 0; + sendneed[0][1] = MIN(maxneed[0],right); + } else recvneed[0][0] = recvneed[0][1] = + sendneed[0][0] = sendneed[0][1] = maxneed[0]; + + if (!periodicity[1]) { + recvneed[1][0] = MIN(maxneed[1],myloc[1]); + recvneed[1][1] = MIN(maxneed[1],procgrid[1]-myloc[1]-1); + left = myloc[1] - 1; + if (left < 0) left = procgrid[1] - 1; + sendneed[1][0] = MIN(maxneed[1],procgrid[1]-left-1); + right = myloc[1] + 1; + if (right == procgrid[1]) right = 0; + sendneed[1][1] = MIN(maxneed[1],right); + } else recvneed[1][0] = recvneed[1][1] = + sendneed[1][0] = sendneed[1][1] = maxneed[1]; + + if (!periodicity[2]) { + recvneed[2][0] = MIN(maxneed[2],myloc[2]); + recvneed[2][1] = MIN(maxneed[2],procgrid[2]-myloc[2]-1); + left = myloc[2] - 1; + if (left < 0) left = procgrid[2] - 1; + sendneed[2][0] = MIN(maxneed[2],procgrid[2]-left-1); + right = myloc[2] + 1; + if (right == procgrid[2]) right = 0; + sendneed[2][1] = MIN(maxneed[2],right); + } else recvneed[2][0] = recvneed[2][1] = + sendneed[2][0] = sendneed[2][1] = maxneed[2]; + + } else { + recvneed[0][0] = updown(0,0,myloc[0],prd[0],periodicity[0],xsplit); + recvneed[0][1] = updown(0,1,myloc[0],prd[0],periodicity[0],xsplit); + left = myloc[0] - 1; + if (left < 0) left = procgrid[0] - 1; + sendneed[0][0] = updown(0,1,left,prd[0],periodicity[0],xsplit); + right = myloc[0] + 1; + if (right == procgrid[0]) right = 0; + sendneed[0][1] = updown(0,0,right,prd[0],periodicity[0],xsplit); + + recvneed[1][0] = updown(1,0,myloc[1],prd[1],periodicity[1],ysplit); + recvneed[1][1] = updown(1,1,myloc[1],prd[1],periodicity[1],ysplit); + left = myloc[1] - 1; + if (left < 0) left = procgrid[1] - 1; + sendneed[1][0] = updown(1,1,left,prd[1],periodicity[1],ysplit); + right = myloc[1] + 1; + if (right == procgrid[1]) right = 0; + sendneed[1][1] = updown(1,0,right,prd[1],periodicity[1],ysplit); + + if (domain->dimension == 3) { + recvneed[2][0] = updown(2,0,myloc[2],prd[2],periodicity[2],zsplit); + recvneed[2][1] = updown(2,1,myloc[2],prd[2],periodicity[2],zsplit); + left = myloc[2] - 1; + if (left < 0) left = procgrid[2] - 1; + sendneed[2][0] = updown(2,1,left,prd[2],periodicity[2],zsplit); + right = myloc[2] + 1; + if (right == procgrid[2]) right = 0; + sendneed[2][1] = updown(2,0,right,prd[2],periodicity[2],zsplit); + } else recvneed[2][0] = recvneed[2][1] = + sendneed[2][0] = sendneed[2][1] = 0; + + int all[6]; + MPI_Allreduce(&recvneed[0][0],all,6,MPI_INT,MPI_MAX,world); + maxneed[0] = MAX(all[0],all[1]); + maxneed[1] = MAX(all[2],all[3]); + maxneed[2] = MAX(all[4],all[5]); + } // allocate comm memory - nswap = 2 * (need[0]+need[1]+need[2]); + nswap = 2 * (maxneed[0]+maxneed[1]+maxneed[2]); if (nswap > maxswap) grow_swap(nswap); // setup parameters for each exchange: @@ -428,13 +538,11 @@ void Comm::setup() // use -BIG/midpt/BIG to insure all atoms included even if round-off occurs // if round-off, atoms recvd across PBC can be < or > than subbox boundary // note that borders() only loops over subset of atoms during each swap - // set slablo > slabhi for swaps across non-periodic boundaries - // this insures no atoms are swapped - // only for procs owning sub-box at non-periodic end of global box + // treat all as PBC here, non-PBC is handled in borders() via r/s need[][] // for style MULTI: - // multilo/multihi is same as slablo/slabhi, only for each atom type + // multilo/multihi is same, with slablo/slabhi for each atom type // pbc_flag: 0 = nothing across a boundary, 1 = something across a boundary - // pbc = -1/0/1 for PBC factor in each of 3/6 orthog/triclinic dirs + // pbc = -1/0/1 for PBC factor in each of 3/6 orthogonal/triclinic dirs // for triclinic, slablo/hi and pbc_border will be used in lamda (0-1) coords // 1st part of if statement is sending to the west/south/down // 2nd part of if statement is sending to the east/north/up @@ -443,7 +551,7 @@ void Comm::setup() int iswap = 0; for (dim = 0; dim < 3; dim++) { - for (ineed = 0; ineed < 2*need[dim]; ineed++) { + for (ineed = 0; ineed < 2*maxneed[dim]; ineed++) { pbc_flag[iswap] = 0; pbc[iswap][0] = pbc[iswap][1] = pbc[iswap][2] = pbc[iswap][3] = pbc[iswap][4] = pbc[iswap][5] = 0; @@ -463,18 +571,11 @@ void Comm::setup() } } if (myloc[dim] == 0) { - if (periodicity[dim] == 0) { - if (style == SINGLE) slabhi[iswap] = slablo[iswap] - 1.0; - else - for (i = 1; i <= ntypes; i++) - multihi[iswap][i] = multilo[iswap][i] - 1.0; - } else { - pbc_flag[iswap] = 1; - pbc[iswap][dim] = 1; - if (triclinic) { - if (dim == 1) pbc[iswap][5] = 1; - else if (dim == 2) pbc[iswap][4] = pbc[iswap][3] = 1; - } + pbc_flag[iswap] = 1; + pbc[iswap][dim] = 1; + if (triclinic) { + if (dim == 1) pbc[iswap][5] = 1; + else if (dim == 2) pbc[iswap][4] = pbc[iswap][3] = 1; } } @@ -493,18 +594,11 @@ void Comm::setup() } } if (myloc[dim] == procgrid[dim]-1) { - if (periodicity[dim] == 0) { - if (style == SINGLE) slabhi[iswap] = slablo[iswap] - 1.0; - else - for (i = 1; i <= ntypes; i++) - multihi[iswap][i] = multilo[iswap][i] - 1.0; - } else { - pbc_flag[iswap] = 1; - pbc[iswap][dim] = -1; - if (triclinic) { - if (dim == 1) pbc[iswap][5] = -1; - else if (dim == 2) pbc[iswap][4] = pbc[iswap][3] = -1; - } + pbc_flag[iswap] = 1; + pbc[iswap][dim] = -1; + if (triclinic) { + if (dim == 1) pbc[iswap][5] = -1; + else if (dim == 2) pbc[iswap][4] = pbc[iswap][3] = -1; } } } @@ -514,6 +608,55 @@ void Comm::setup() } } +/* ---------------------------------------------------------------------- + walk up/down the extent of nearby processors in dim and dir + loc = myloc of proc to start at + dir = 0/1 = walk to left/right + do not cross non-periodic boundaries + is not called for z dim in 2d + return how many procs away are needed to encompass cutghost away from loc +------------------------------------------------------------------------- */ + +int Comm::updown(int dim, int dir, int loc, + double prd, int periodicity, double *split) +{ + int index,count; + double frac,delta; + + if (dir == 0) { + frac = cutghost[dim]/prd; + index = loc - 1; + delta = 0.0; + count = 0; + while (delta < frac) { + if (index < 0) { + if (!periodicity) break; + index = procgrid[dim] - 1; + } + count++; + delta += split[index+1] - split[index]; + index--; + } + + } else { + frac = cutghost[dim]/prd; + index = loc + 1; + delta = 0.0; + count = 0; + while (delta < frac) { + if (index >= procgrid[dim]) { + if (!periodicity) break; + index = 0; + } + count++; + delta += split[index+1] - split[index]; + index++; + } + } + + return count; +} + /* ---------------------------------------------------------------------- forward communication of atom coords every timestep other per-atom attributes may also be sent via pack/unpack routines @@ -537,27 +680,30 @@ void Comm::forward_comm(int dummy) if (comm_x_only) { if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]]; else buf = NULL; - MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, - recvproc[iswap],0,world,&request); + if (size_forward_recv[iswap]) + MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); n = avec->pack_comm(sendnum[iswap],sendlist[iswap], buf_send,pbc_flag[iswap],pbc[iswap]); - MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); - MPI_Wait(&request,&status); + if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + if (size_forward_recv[iswap]) MPI_Wait(&request,&status); } else if (ghost_velocity) { - MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, - recvproc[iswap],0,world,&request); + if (size_forward_recv[iswap]) + MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap], buf_send,pbc_flag[iswap],pbc[iswap]); - MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); - MPI_Wait(&request,&status); + if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + if (size_forward_recv[iswap]) MPI_Wait(&request,&status); avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv); } else { - MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, - recvproc[iswap],0,world,&request); + if (size_forward_recv[iswap]) + MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); n = avec->pack_comm(sendnum[iswap],sendlist[iswap], buf_send,pbc_flag[iswap],pbc[iswap]); - MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); - MPI_Wait(&request,&status); + if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + if (size_forward_recv[iswap]) MPI_Wait(&request,&status); avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv); } @@ -601,19 +747,22 @@ void Comm::reverse_comm() for (int iswap = nswap-1; iswap >= 0; iswap--) { if (sendproc[iswap] != me) { if (comm_f_only) { - MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE, - sendproc[iswap],0,world,&request); + if (size_reverse_recv[iswap]) + MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); if (size_reverse_send[iswap]) buf = f[firstrecv[iswap]]; else buf = NULL; - MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, - recvproc[iswap],0,world); - MPI_Wait(&request,&status); + if (size_reverse_send[iswap]) + MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE, + recvproc[iswap],0,world); + if (size_reverse_recv[iswap]) MPI_Wait(&request,&status); } else { - MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE, - sendproc[iswap],0,world,&request); + if (size_reverse_recv[iswap]) + MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send); - MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world); - MPI_Wait(&request,&status); + if (n) MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world); + if (size_reverse_recv[iswap]) MPI_Wait(&request,&status); } avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_recv); @@ -752,8 +901,8 @@ void Comm::exchange() void Comm::borders() { - int i,n,itype,iswap,dim,ineed,maxneed,smax,rmax; - int nsend,nrecv,nfirst,nlast,ngroup; + int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax; + int nsend,nrecv,sendflag,nfirst,nlast,ngroup; double lo,hi; int *type; double **x; @@ -774,8 +923,8 @@ void Comm::borders() for (dim = 0; dim < 3; dim++) { nlast = 0; - maxneed = 2*need[dim]; - for (ineed = 0; ineed < maxneed; ineed++) { + twoneed = 2*maxneed[dim]; + for (ineed = 0; ineed < twoneed; ineed++) { // find atoms within slab boundaries lo/hi using <= and >= // check atoms between nfirst and nlast @@ -799,55 +948,64 @@ void Comm::borders() nsend = 0; + // sendflag = 0 if I do not send on this swap + // sendneed test indicates receiver no longer requires data + // e.g. due to non-PBC or non-uniform sub-domains + + if (ineed/2 >= sendneed[dim][ineed % 2]) sendflag = 0; + else sendflag = 1; + // find send atoms according to SINGLE vs MULTI // all atoms eligible versus atoms in bordergroup // only need to limit loop to bordergroup for first sends (ineed < 2) // on these sends, break loop in two: owned (in group) and ghost - if (!bordergroup || ineed >= 2) { - if (style == SINGLE) { - for (i = nfirst; i < nlast; i++) - if (x[i][dim] >= lo && x[i][dim] <= hi) { - if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); - sendlist[iswap][nsend++] = i; - } - } else { - for (i = nfirst; i < nlast; i++) { - itype = type[i]; - if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { - if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); - sendlist[iswap][nsend++] = i; + if (sendflag) { + if (!bordergroup || ineed >= 2) { + if (style == SINGLE) { + for (i = nfirst; i < nlast; i++) + if (x[i][dim] >= lo && x[i][dim] <= hi) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } else { + for (i = nfirst; i < nlast; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } } } - } - - } else { - if (style == SINGLE) { - ngroup = atom->nfirst; - for (i = 0; i < ngroup; i++) - if (x[i][dim] >= lo && x[i][dim] <= hi) { - if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); - sendlist[iswap][nsend++] = i; - } - for (i = atom->nlocal; i < nlast; i++) - if (x[i][dim] >= lo && x[i][dim] <= hi) { - if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); - sendlist[iswap][nsend++] = i; - } + } else { - ngroup = atom->nfirst; - for (i = 0; i < ngroup; i++) { - itype = type[i]; - if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { - if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); - sendlist[iswap][nsend++] = i; + if (style == SINGLE) { + ngroup = atom->nfirst; + for (i = 0; i < ngroup; i++) + if (x[i][dim] >= lo && x[i][dim] <= hi) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + for (i = atom->nlocal; i < nlast; i++) + if (x[i][dim] >= lo && x[i][dim] <= hi) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } else { + ngroup = atom->nfirst; + for (i = 0; i < ngroup; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } } - } - for (i = atom->nlocal; i < nlast; i++) { - itype = type[i]; - if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { - if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); - sendlist[iswap][nsend++] = i; + for (i = atom->nlocal; i < nlast; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } } } } @@ -865,18 +1023,18 @@ void Comm::borders() pbc_flag[iswap],pbc[iswap]); // swap atoms with other proc + // no MPI calls except SendRecv if nsend/nrecv = 0 // put incoming ghosts at end of my atom arrays // if swapping with self, simply copy, no messages if (sendproc[iswap] != me) { MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0, &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status); - if (nrecv*size_border > maxrecv) - grow_recv(nrecv*size_border); - MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE, - recvproc[iswap],0,world,&request); - MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); - MPI_Wait(&request,&status); + if (nrecv*size_border > maxrecv) grow_recv(nrecv*size_border); + if (nrecv) MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE, + recvproc[iswap],0,world,&request); + if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + if (nrecv) MPI_Wait(&request,&status); buf = buf_recv; } else { nrecv = nsend; @@ -939,10 +1097,12 @@ void Comm::forward_comm_pair(Pair *pair) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, - world,&request); - MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); - MPI_Wait(&request,&status); + if (recvnum[iswap]) + MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, + world,&request); + if (sendnum[iswap]) + MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); + if (recvnum[iswap]) MPI_Wait(&request,&status); buf = buf_recv; } else buf = buf_send; @@ -973,10 +1133,12 @@ void Comm::reverse_comm_pair(Pair *pair) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0, - world,&request); - MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world); - MPI_Wait(&request,&status); + if (sendnum[iswap]) + MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0, + world,&request); + if (recvnum[iswap]) + MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world); + if (sendnum[iswap]) MPI_Wait(&request,&status); buf = buf_recv; } else buf = buf_send; @@ -1008,10 +1170,12 @@ void Comm::forward_comm_fix(Fix *fix) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, - world,&request); - MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); - MPI_Wait(&request,&status); + if (recvnum[iswap]) + MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, + world,&request); + if (sendnum[iswap]) + MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); + if (recvnum[iswap]) MPI_Wait(&request,&status); buf = buf_recv; } else buf = buf_send; @@ -1042,10 +1206,12 @@ void Comm::reverse_comm_fix(Fix *fix) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0, - world,&request); - MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world); - MPI_Wait(&request,&status); + if (sendnum[iswap]) + MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0, + world,&request); + if (recvnum[iswap]) + MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world); + if (sendnum[iswap]) MPI_Wait(&request,&status); buf = buf_recv; } else buf = buf_send; @@ -1077,10 +1243,12 @@ void Comm::forward_comm_compute(Compute *compute) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, - world,&request); - MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); - MPI_Wait(&request,&status); + if (recvnum[iswap]) + MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, + world,&request); + if (sendnum[iswap]) + MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); + if (recvnum[iswap]) MPI_Wait(&request,&status); buf = buf_recv; } else buf = buf_send; @@ -1111,10 +1279,12 @@ void Comm::reverse_comm_compute(Compute *compute) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0, - world,&request); - MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world); - MPI_Wait(&request,&status); + if (sendnum[iswap]) + MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0, + world,&request); + if (recvnum[iswap]) + MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world); + if (sendnum[iswap]) MPI_Wait(&request,&status); buf = buf_recv; } else buf = buf_send; @@ -1146,10 +1316,12 @@ void Comm::forward_comm_dump(Dump *dump) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, - world,&request); - MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); - MPI_Wait(&request,&status); + if (recvnum[iswap]) + MPI_Irecv(buf_recv,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0, + world,&request); + if (sendnum[iswap]) + MPI_Send(buf_send,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world); + if (recvnum[iswap]) MPI_Wait(&request,&status); buf = buf_recv; } else buf = buf_send; @@ -1180,10 +1352,12 @@ void Comm::reverse_comm_dump(Dump *dump) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0, - world,&request); - MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world); - MPI_Wait(&request,&status); + if (sendnum[iswap]) + MPI_Irecv(buf_recv,n*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0, + world,&request); + if (recvnum[iswap]) + MPI_Send(buf_send,n*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,world); + if (sendnum[iswap]) MPI_Wait(&request,&status); buf = buf_recv; } else buf = buf_send; @@ -1373,6 +1547,10 @@ void Comm::set_processors(int narg, char **arg) if (user_procgrid[0] < 0 || user_procgrid[1] < 0 || user_procgrid[2] < 0) error->all(FLERR,"Illegal processors command"); + int p = user_procgrid[0]*user_procgrid[1]*user_procgrid[2]; + if (p && p != nprocs) + error->all(FLERR,"Specified processors != physical processors"); + int iarg = 3; while (iarg < narg) { if (strcmp(arg[iarg],"grid") == 0) { diff --git a/src/comm.h b/src/comm.h index 563ff48fbb..b920d1f5fd 100644 --- a/src/comm.h +++ b/src/comm.h @@ -24,8 +24,10 @@ class Comm : protected Pointers { int procgrid[3]; // procs assigned in each dim of 3d grid int user_procgrid[3]; // user request for procs in each dim int myloc[3]; // which proc I am in each dim - int procneigh[3][2]; // my 6 neighboring procs + int procneigh[3][2]; // my 6 neighboring procs, 0/1 = left/right int ghost_velocity; // 1 if ghost atoms have velocity, 0 if not + int uniform; // 1 = equal subdomains, 0 = load-balanced + double *xsplit,*ysplit,*zsplit; // fractional (0-1) sub-domain sizes double cutghost[3]; // cutoffs used for acquiring ghost atoms double cutghostuser; // user-specified ghost cutoff int ***grid2proc; // which proc owns i,j,k loc in 3d grid @@ -63,8 +65,10 @@ class Comm : protected Pointers { protected: int style; // single vs multi-type comm - int nswap; // # of swaps to perform - int need[3]; // procs I need atoms from in each dim + int nswap; // # of swaps to perform = sum of maxneed + int recvneed[3][2]; // # of procs away I recv atoms from + int sendneed[3][2]; // # of procs away I send atoms to + int maxneed[3]; // max procs away any proc needs, per dim int triclinic; // 0 if domain is orthog, 1 if triclinic int maxswap; // max # of swaps memory is allocated for int size_forward; // # of per-atom datums in forward comm @@ -106,6 +110,8 @@ class Comm : protected Pointers { int maxsend,maxrecv; // current size of send/recv buffer int maxforward,maxreverse; // max # of datums in forward/reverse comm + int updown(int, int, int, double, int, double *); + // compare cutoff to procs virtual void grow_send(int,int); // reallocate send buffer virtual void grow_recv(int); // free/allocate recv buffer virtual void grow_list(int, int); // reallocate one sendlist diff --git a/src/domain.cpp b/src/domain.cpp index a83f958f6a..e35f768ab7 100644 --- a/src/domain.cpp +++ b/src/domain.cpp @@ -215,58 +215,57 @@ void Domain::set_global_box() } /* ---------------------------------------------------------------------- - set lamda box params, only need be done one time - assumes global box is defined and proc assignment has been made by comm - for uppermost proc, insure subhi = 1.0 (in case round-off occurs) + set lamda box params + assumes global box is defined and proc assignment has been made + uses comm->xyz_split to define subbox boundaries in consistent manner ------------------------------------------------------------------------- */ void Domain::set_lamda_box() { int *myloc = comm->myloc; - int *procgrid = comm->procgrid; + double *xsplit = comm->xsplit; + double *ysplit = comm->ysplit; + double *zsplit = comm->zsplit; - sublo_lamda[0] = 1.0*myloc[0] / procgrid[0]; - sublo_lamda[1] = 1.0*myloc[1] / procgrid[1]; - sublo_lamda[2] = 1.0*myloc[2] / procgrid[2]; + sublo_lamda[0] = xsplit[myloc[0]]; + subhi_lamda[0] = xsplit[myloc[0]+1]; - if (myloc[0] < procgrid[0]-1) - subhi_lamda[0] = 1.0*(myloc[0]+1) / procgrid[0]; - else subhi_lamda[0] = 1.0; + sublo_lamda[1] = ysplit[myloc[1]]; + subhi_lamda[1] = ysplit[myloc[1]+1]; - if (myloc[1] < procgrid[1]-1) - subhi_lamda[1] = 1.0*(myloc[1]+1) / procgrid[1]; - else subhi_lamda[1] = 1.0; - - if (myloc[2] < procgrid[2]-1) - subhi_lamda[2] = 1.0*(myloc[2]+1) / procgrid[2]; - else subhi_lamda[2] = 1.0; + sublo_lamda[2] = zsplit[myloc[2]]; + subhi_lamda[2] = zsplit[myloc[2]+1]; } /* ---------------------------------------------------------------------- - set local subbox params + set local subbox params for orthogonal boxes assumes global box is defined and proc assignment has been made - for uppermost proc, insure subhi = boxhi (in case round-off occurs) + uses comm->xyz_split to define subbox boundaries in consistent manner + insure subhi[max] = boxhi ------------------------------------------------------------------------- */ void Domain::set_local_box() { int *myloc = comm->myloc; int *procgrid = comm->procgrid; + double *xsplit = comm->xsplit; + double *ysplit = comm->ysplit; + double *zsplit = comm->zsplit; if (triclinic == 0) { - sublo[0] = boxlo[0] + myloc[0] * xprd / procgrid[0]; + sublo[0] = boxlo[0] + xprd*xsplit[myloc[0]]; if (myloc[0] < procgrid[0]-1) - subhi[0] = boxlo[0] + (myloc[0]+1) * xprd / procgrid[0]; + subhi[0] = boxlo[0] + xprd*xsplit[myloc[0]+1]; else subhi[0] = boxhi[0]; - - sublo[1] = boxlo[1] + myloc[1] * yprd / procgrid[1]; + + sublo[1] = boxlo[1] + yprd*ysplit[myloc[1]]; if (myloc[1] < procgrid[1]-1) - subhi[1] = boxlo[1] + (myloc[1]+1) * yprd / procgrid[1]; + subhi[1] = boxlo[1] + yprd*ysplit[myloc[1]+1]; else subhi[1] = boxhi[1]; - sublo[2] = boxlo[2] + myloc[2] * zprd / procgrid[2]; + sublo[2] = boxlo[2] + zprd*zsplit[myloc[2]]; if (myloc[2] < procgrid[2]-1) - subhi[2] = boxlo[2] + (myloc[2]+1) * zprd / procgrid[2]; + subhi[2] = boxlo[2] + zprd*zsplit[myloc[2]+1]; else subhi[2] = boxhi[2]; } } @@ -274,7 +273,7 @@ void Domain::set_local_box() /* ---------------------------------------------------------------------- reset global & local boxes due to global box boundary changes if shrink-wrapped, determine atom extent and reset boxlo/hi - if triclinic, perform any shrink-wrap in lamda space + for triclinic, atoms must be in lamda coords (0-1) before reset_box is called ------------------------------------------------------------------------- */ void Domain::reset_box() diff --git a/src/irregular.cpp b/src/irregular.cpp index fe801ae45e..f02c401acb 100644 --- a/src/irregular.cpp +++ b/src/irregular.cpp @@ -83,7 +83,8 @@ void Irregular::migrate_atoms() atom->nghost = 0; atom->avec->clear_bonus(); - // subbox bounds for orthogonal or triclinic + // subbox bounds for orthogonal or triclinic box + // other comm/domain data used by coord2proc() double *sublo,*subhi; if (triclinic == 0) { @@ -94,6 +95,13 @@ void Irregular::migrate_atoms() subhi = domain->subhi_lamda; } + uniform = comm->uniform; + xsplit = comm->xsplit; + ysplit = comm->ysplit; + zsplit = comm->zsplit; + boxlo = domain->boxlo; + prd = domain->prd; + // loop over atoms, flag any that are not in my sub-box // fill buffer with atoms leaving my box, using < and >= // assign which proc it belongs to via coord2proc() @@ -603,24 +611,35 @@ void Irregular::destroy_data() /* ---------------------------------------------------------------------- determine which proc owns atom with coord x[3] x will be in box (orthogonal) or lamda coords (triclinic) + for uniform = 1, directly calculate owning proc + for non-uniform, iteratively find owning proc via binary search ------------------------------------------------------------------------- */ int Irregular::coord2proc(double *x) { int loc[3]; - if (triclinic == 0) { - double *boxlo = domain->boxlo; - double *boxhi = domain->boxhi; - loc[0] = static_cast - (procgrid[0] * (x[0]-boxlo[0]) / (boxhi[0]-boxlo[0])); - loc[1] = static_cast - (procgrid[1] * (x[1]-boxlo[1]) / (boxhi[1]-boxlo[1])); - loc[2] = static_cast - (procgrid[2] * (x[2]-boxlo[2]) / (boxhi[2]-boxlo[2])); + + if (uniform) { + if (triclinic == 0) { + loc[0] = static_cast (procgrid[0] * (x[0]-boxlo[0]) / prd[0]); + loc[1] = static_cast (procgrid[1] * (x[1]-boxlo[1]) / prd[1]); + loc[2] = static_cast (procgrid[2] * (x[2]-boxlo[2]) / prd[2]); + } else { + loc[0] = static_cast (procgrid[0] * x[0]); + loc[1] = static_cast (procgrid[1] * x[1]); + loc[2] = static_cast (procgrid[2] * x[2]); + } + } else { - loc[0] = static_cast (procgrid[0] * x[0]); - loc[1] = static_cast (procgrid[1] * x[1]); - loc[2] = static_cast (procgrid[2] * x[2]); + if (triclinic == 0) { + loc[0] = binary((x[0]-boxlo[0])/prd[0],procgrid[0],xsplit); + loc[1] = binary((x[1]-boxlo[1])/prd[1],procgrid[1],ysplit); + loc[2] = binary((x[2]-boxlo[2])/prd[2],procgrid[2],zsplit); + } else { + loc[0] = binary(x[0],procgrid[0],xsplit); + loc[1] = binary(x[1],procgrid[1],ysplit); + loc[2] = binary(x[2],procgrid[2],zsplit); + } } if (loc[0] < 0) loc[0] = 0; @@ -633,6 +652,36 @@ int Irregular::coord2proc(double *x) return grid2proc[loc[0]][loc[1]][loc[2]]; } +/* ---------------------------------------------------------------------- + binary search for value in N-length ascending vec + value may be outside range of vec limits + always return index from 0 to N-1 inclusive + return 0 if value < vec[0] + reutrn N-1 if value >= vec[N-1] + return index = 1 to N-2 if vec[index] <= value < vec[index+1] +------------------------------------------------------------------------- */ + +int Irregular::binary(double value, int n, double *vec) +{ + int lo = 0; + int hi = n-1; + + if (value < vec[lo]) return lo; + if (value >= vec[hi]) return hi; + + // insure vec[lo] <= value < vec[hi] at every iteration + // done when lo,hi are adjacent + + int index = (lo+hi)/2; + while (lo < hi-1) { + if (value < vec[index]) hi = index; + else if (value >= vec[index]) lo = index; + index = (lo+hi)/2; + } + + return index; +} + /* ---------------------------------------------------------------------- realloc the size of the send buffer as needed with BUFFACTOR & BUFEXTRA if flag = 1, realloc diff --git a/src/irregular.h b/src/irregular.h index 06b840171f..1b1477872b 100644 --- a/src/irregular.h +++ b/src/irregular.h @@ -32,8 +32,12 @@ class Irregular : protected Pointers { int me,nprocs; int triclinic; int map_style; - int *procgrid; - int ***grid2proc; + int uniform; + double *xsplit,*ysplit,*zsplit; // ptrs to comm + int *procgrid; // ptr to comm + int ***grid2proc; // ptr to comm + double *boxlo; // ptr to domain + double *prd; // ptr to domain int maxsend,maxrecv; // size of buffers in # of doubles double *buf_send,*buf_recv; @@ -81,6 +85,7 @@ class Irregular : protected Pointers { void exchange_atom(double *, int *, double *); void destroy_atom(); int coord2proc(double *); + int binary(double, int, double *); void grow_send(int,int); // reallocate send buffer void grow_recv(int); // free/allocate recv buffer