Files
lammps/src/comm.cpp
2022-08-27 10:23:58 -04:00

1372 lines
51 KiB
C++

// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "comm.h"
#include "accelerator_kokkos.h"
#include "atom.h" // IWYU pragma: keep
#include "atom_vec.h"
#include "bond.h"
#include "compute.h"
#include "domain.h" // IWYU pragma: keep
#include "dump.h"
#include "error.h"
#include "fix.h"
#include "force.h"
#include "group.h"
#include "irregular.h"
#include "memory.h" // IWYU pragma: keep
#include "modify.h"
#include "neighbor.h" // IWYU pragma: keep
#include "output.h"
#include "pair.h"
#include "procmap.h"
#include "universe.h"
#include "update.h"
#include <cstring>
#ifdef _OPENMP
#include <omp.h>
#endif
using namespace LAMMPS_NS;
// extra space added on top of maxexchange when bufextra is (re)computed,
// see the constructor and init_exchange()
#define BUFEXTRA 1024
// values of gridflag, selected by the "grid" keyword of the processors command
enum{ONELEVEL,TWOLEVEL,NUMA,CUSTOM};
// values of mapflag, selected by the "map" keyword of the processors command
// XYZ stands for any of the six explicit axis orderings (xyz, xzy, ...)
enum{CART,CARTREORDER,XYZ};
/* ---------------------------------------------------------------------- */
Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
{
  // my rank and the number of ranks in the world communicator

  MPI_Comm_rank(world,&me);
  MPI_Comm_size(world,&nprocs);

  // communication mode and ghost cutoff settings

  mode = 0;
  bordergroup = 0;
  ghost_velocity = 0;
  multi_reduce = 0;
  cutghostuser = 0.0;
  cutusermulti = nullptr;
  cutusermultiold = nullptr;
  ncollections = 0;
  ncollections_cutoff = 0;

  // processor grid settings: 0 = no user request, core grid defaults to 1x1x1

  for (int idim = 0; idim < 3; idim++) {
    user_procgrid[idim] = 0;
    coregrid[idim] = 1;
  }
  gridflag = ONELEVEL;
  mapflag = CART;
  customfile = nullptr;
  outfile = nullptr;
  grid2proc = nullptr;
  xsplit = nullptr;
  ysplit = nullptr;
  zsplit = nullptr;
  rcbnew = 0;

  // no dependency on the processor grid of another partition

  recv_from_partition = -1;
  send_to_partition = -1;
  otherflag = 0;

  // exchange buffer sizing

  maxexchange = 0;
  maxexchange_atom = 0;
  maxexchange_fix = 0;
  maxexchange_fix_dynamic = 0;
  bufextra = BUFEXTRA;

  // number of OpenMP threads per MPI task
  // the default is a single thread; when compiled with OpenMP support the
  // value is taken from Kokkos (if active) or from the OpenMP runtime,
  // but only when OMP_NUM_THREADS is set. most OpenMP implementations
  // would otherwise default to one thread per (virtual) core, which is
  // rarely what the user expects.

  nthreads = 1;
#ifdef _OPENMP
  const bool have_kokkos = (lmp->kokkos != nullptr);

  if (have_kokkos) {
    nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
  } else if (getenv("OMP_NUM_THREADS") != nullptr) {
    nthreads = omp_get_max_threads();
  } else {
    // nthreads keeps its default of 1
    if (me == 0)
      error->message(FLERR,"OMP_NUM_THREADS environment is not set. "
                     "Defaulting to 1 thread.");
  }

  // all MPI tasks adopt the thread count of rank 0

  MPI_Bcast(&nthreads,1,MPI_INT,0,world);
  if (!have_kokkos) omp_set_num_threads(nthreads);

  if (me == 0)
    utils::logmesg(lmp,"  using {} OpenMP thread(s) per MPI task\n",nthreads);
#endif
}
/* ---------------------------------------------------------------------- */
Comm::~Comm()
{
  // file name strings are released with delete []

  delete [] outfile;
  delete [] customfile;

  // per-collection / per-type user cutoffs

  memory->destroy(cutusermultiold);
  memory->destroy(cutusermulti);

  // processor grid map and sub-domain split fractions

  memory->destroy(zsplit);
  memory->destroy(ysplit);
  memory->destroy(xsplit);
  memory->destroy(grid2proc);
}
/* ----------------------------------------------------------------------
deep copy of arrays from old Comm class to new one
all public/protected vectors/arrays in parent Comm class must be copied
called from alternate constructor of child classes
when new comm style is created from Input
------------------------------------------------------------------------- */
void Comm::copy_arrays(Comm *oldcomm)
{
  // Give this instance its own copies of everything oldcomm owns through a
  // pointer. Every guard tests the pointer of oldcomm, so the result does
  // not depend on what the corresponding member of this instance holds
  // on entry.

  // processor grid map and split fractions, sized by procgrid of this instance

  if (oldcomm->grid2proc) {
    memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
                   "comm:grid2proc");
    memcpy(&grid2proc[0][0][0],&oldcomm->grid2proc[0][0][0],
           (procgrid[0]*procgrid[1]*procgrid[2])*sizeof(int));

    memory->create(xsplit,procgrid[0]+1,"comm:xsplit");
    memory->create(ysplit,procgrid[1]+1,"comm:ysplit");
    memory->create(zsplit,procgrid[2]+1,"comm:zsplit");
    memcpy(xsplit,oldcomm->xsplit,(procgrid[0]+1)*sizeof(double));
    memcpy(ysplit,oldcomm->ysplit,(procgrid[1]+1)*sizeof(double));
    memcpy(zsplit,oldcomm->zsplit,(procgrid[2]+1)*sizeof(double));
  }

  ncollections = oldcomm->ncollections;
  ncollections_cutoff = oldcomm->ncollections_cutoff;

  // user cutoffs: memcpy() takes a byte count, so the number of elements
  // has to be scaled by the element size or only part of the array is copied

  if (oldcomm->cutusermulti) {
    memory->create(cutusermulti,ncollections_cutoff,"comm:cutusermulti");
    memcpy(cutusermulti,oldcomm->cutusermulti,
           ncollections_cutoff*sizeof(*cutusermulti));
  }

  if (oldcomm->cutusermultiold) {
    memory->create(cutusermultiold,atom->ntypes+1,"comm:cutusermultiold");
    memcpy(cutusermultiold,oldcomm->cutusermultiold,
           (atom->ntypes+1)*sizeof(*cutusermultiold));
  }

  // file names: duplicate whenever oldcomm has one

  if (oldcomm->customfile)
    customfile = utils::strdup(oldcomm->customfile);
  if (oldcomm->outfile)
    outfile = utils::strdup(oldcomm->outfile);
}
/* ----------------------------------------------------------------------
common to all Comm styles
------------------------------------------------------------------------- */
void Comm::init()
{
triclinic = domain->triclinic;
map_style = atom->map_style;
// check warn if any proc's subbox is smaller than neigh skin
// since may lead to lost atoms in exchange()
// really should check every exchange() in case box size is shrinking
// but seems overkill to do that (fix balance does perform this check)
domain->subbox_too_small_check(neighbor->skin);
// comm_only = 1 if only x,f are exchanged in forward/reverse comm
// comm_x_only = 0 if ghost_velocity since velocities are added
comm_x_only = atom->avec->comm_x_only;
comm_f_only = atom->avec->comm_f_only;
if (ghost_velocity) comm_x_only = 0;
// set per-atom sizes for forward/reverse/border comm
// augment by velocity and fix quantities if needed
size_forward = atom->avec->size_forward;
size_reverse = atom->avec->size_reverse;
size_border = atom->avec->size_border;
if (ghost_velocity) size_forward += atom->avec->size_velocity;
if (ghost_velocity) size_border += atom->avec->size_velocity;
const auto &fix_list = modify->get_fix_list();
for (const auto &fix : fix_list)
size_border += fix->comm_border;
// per-atom limits for communication
// maxexchange = max # of datums in exchange comm, set in exchange()
// maxforward = # of datums in largest forward comm
// maxreverse = # of datums in largest reverse comm
// query pair,fix,compute,dump for their requirements
// pair style can force reverse comm even if newton off
maxforward = MAX(size_forward,size_border);
maxreverse = size_reverse;
if (force->pair) maxforward = MAX(maxforward,force->pair->comm_forward);
if (force->pair) maxreverse = MAX(maxreverse,force->pair->comm_reverse);
if (force->bond) maxforward = MAX(maxforward,force->bond->comm_forward);
if (force->bond) maxreverse = MAX(maxreverse,force->bond->comm_reverse);
for (const auto &fix : fix_list) {
maxforward = MAX(maxforward, fix->comm_forward);
maxreverse = MAX(maxreverse, fix->comm_reverse);
}
for (const auto &compute : modify->get_compute_list()) {
maxforward = MAX(maxforward,compute->comm_forward);
maxreverse = MAX(maxreverse,compute->comm_reverse);
}
for (const auto &dump: output->get_dump_list()) {
maxforward = MAX(maxforward,dump->comm_forward);
maxreverse = MAX(maxreverse,dump->comm_reverse);
}
if (force->newton == 0) maxreverse = 0;
if (force->pair) maxreverse = MAX(maxreverse,force->pair->comm_reverse_off);
if (force->bond) maxreverse = MAX(maxreverse,force->bond->comm_reverse_off);
// maxexchange_atom = size of an exchanged atom, set by AtomVec
// only needs to be set if size > BUFEXTRA
// maxexchange_fix_dynamic = 1 if any fix sets its maxexchange dynamically
maxexchange_atom = atom->avec->maxexchange;
maxexchange_fix_dynamic = 0;
for (const auto &fix : fix_list) if (fix->maxexchange_dynamic) maxexchange_fix_dynamic = 1;
if ((mode == Comm::MULTI) && (neighbor->style != Neighbor::MULTI))
error->all(FLERR,"Cannot use comm mode multi without multi-style neighbor lists");
if (multi_reduce) {
if (force->newton == 0)
error->all(FLERR,"Cannot use multi/reduce communication with Newton off");
if (neighbor->any_full())
error->all(FLERR,"Cannot use multi/reduce communication with a full neighbor list");
if (mode != Comm::MULTI)
error->all(FLERR,"Cannot use multi/reduce communication without mode multi");
}
}
/* ----------------------------------------------------------------------
set maxexchange based on AtomVec and fixes
------------------------------------------------------------------------- */
void Comm::init_exchange()
{
  // total exchange contribution of all fixes

  maxexchange_fix = 0;
  const auto &fix_list = modify->get_fix_list();
  for (const auto &ifix : fix_list) {
    maxexchange_fix += ifix->maxexchange;
  }

  // one exchanged atom carries the AtomVec data plus all fix data;
  // the buffer head room is that amount plus the fixed BUFEXTRA margin

  maxexchange = maxexchange_atom + maxexchange_fix;
  bufextra = BUFEXTRA + maxexchange;
}
/* ----------------------------------------------------------------------
modify communication params
invoked from input script by comm_modify command
------------------------------------------------------------------------- */
void Comm::modify_params(int narg, char **arg)
{
  // narg/arg = words following the comm_modify command
  // keywords are processed left to right; each branch checks that its own
  // values are present before reading them and then advances iarg

  if (narg < 1) utils::missing_cmd_args(FLERR, "comm_modify", error);

  int iarg = 0;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"mode") == 0) {
      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "comm_modify mode", error);
      if (strcmp(arg[iarg+1],"single") == 0) {
        // need to reset cutghostuser when switching comm mode
        if (mode == Comm::MULTI) cutghostuser = 0.0;
        if (mode == Comm::MULTIOLD) cutghostuser = 0.0;
        memory->destroy(cutusermulti);
        memory->destroy(cutusermultiold);
        mode = Comm::SINGLE;
      } else if (strcmp(arg[iarg+1],"multi") == 0) {
        if (neighbor->style != Neighbor::MULTI)
          error->all(FLERR,"Cannot use comm mode 'multi' without 'multi' style neighbor lists");
        // need to reset cutghostuser when switching comm mode
        if (mode == Comm::SINGLE) cutghostuser = 0.0;
        if (mode == Comm::MULTIOLD) cutghostuser = 0.0;
        memory->destroy(cutusermultiold);
        mode = Comm::MULTI;
      } else if (strcmp(arg[iarg+1],"multi/old") == 0) {
        if (neighbor->style == Neighbor::MULTI)
          error->all(FLERR,"Cannot use comm mode 'multi/old' with 'multi' style neighbor lists");
        // need to reset cutghostuser when switching comm mode
        if (mode == Comm::SINGLE) cutghostuser = 0.0;
        if (mode == Comm::MULTI) cutghostuser = 0.0;
        memory->destroy(cutusermulti);
        mode = Comm::MULTIOLD;
      } else error->all(FLERR,"Unknown comm_modify mode argument: {}", arg[iarg+1]);
      iarg += 2;

    } else if (strcmp(arg[iarg],"group") == 0) {
      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "comm_modify group", error);
      bordergroup = group->find(arg[iarg+1]);
      if (bordergroup < 0)
        error->all(FLERR, "Invalid comm_modify keyword: group {} not found", arg[iarg+1]);
      // any group other than group 0 must be the atom_modify first group
      if (bordergroup && ((atom->firstgroupname == nullptr) || strcmp(arg[iarg+1],atom->firstgroupname) != 0))
        error->all(FLERR, "Comm_modify group != atom_modify first group: {}", atom->firstgroupname);
      iarg += 2;

    } else if (strcmp(arg[iarg],"cutoff") == 0) {
      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "comm_modify cutoff", error);
      if (mode == Comm::MULTI)
        error->all(FLERR, "Use cutoff/multi keyword to set cutoff in multi mode");
      if (mode == Comm::MULTIOLD)
        error->all(FLERR, "Use cutoff/multi/old keyword to set cutoff in multi mode");
      cutghostuser = utils::numeric(FLERR,arg[iarg+1],false,lmp);
      if (cutghostuser < 0.0)
        error->all(FLERR,"Invalid cutoff {} in comm_modify command", arg[iarg+1]);
      iarg += 2;

    } else if (strcmp(arg[iarg],"cutoff/multi") == 0) {
      int i,nlo,nhi;
      double cut;
      // this keyword takes two values (collection range and cutoff);
      // verify both exist before arg[iarg+1] and arg[iarg+2] are read
      if (iarg+3 > narg) utils::missing_cmd_args(FLERR, "comm_modify cutoff/multi", error);
      if (mode == Comm::SINGLE)
        error->all(FLERR,"Use cutoff keyword to set cutoff in single mode");
      if (mode == Comm::MULTIOLD)
        error->all(FLERR,"Use cutoff/multi/old keyword to set cutoff in multi/old mode");
      if (domain->box_exist == 0)
        error->all(FLERR, "Cannot set cutoff/multi before simulation box is defined");

      // Check if # of collections has changed, if so erase any previously defined cutoffs
      // Neighbor will reset ncollections if collections are redefined

      if (! cutusermulti || ncollections_cutoff != neighbor->ncollections) {
        ncollections_cutoff = neighbor->ncollections;
        memory->destroy(cutusermulti);
        memory->create(cutusermulti,ncollections_cutoff,"comm:cutusermulti");
        // -1.0 marks a collection without a user cutoff
        for (i=0; i < ncollections_cutoff; ++i)
          cutusermulti[i] = -1.0;
      }
      utils::bounds(FLERR,arg[iarg+1],1,ncollections_cutoff,nlo,nhi,error);
      cut = utils::numeric(FLERR,arg[iarg+2],false,lmp);
      cutghostuser = MAX(cutghostuser,cut);
      if (cut < 0.0)
        error->all(FLERR,"Invalid cutoff {} in comm_modify command", arg[iarg+2]);
      // collections use 1-based indexing externally and 0-based indexing internally
      for (i=nlo; i<=nhi; ++i)
        cutusermulti[i-1] = cut;
      iarg += 3;

    } else if (strcmp(arg[iarg],"cutoff/multi/old") == 0) {
      int i,nlo,nhi;
      double cut;
      if (mode == Comm::SINGLE)
        error->all(FLERR,"Use cutoff keyword to set cutoff in single mode");
      if (mode == Comm::MULTI)
        error->all(FLERR,"Use cutoff/multi keyword to set cutoff in multi mode");
      if (domain->box_exist == 0)
        error->all(FLERR, "Cannot set cutoff/multi before simulation box is defined");
      const int ntypes = atom->ntypes;
      if (iarg+3 > narg) utils::missing_cmd_args(FLERR, "comm_modify cutoff/multi/old", error);
      if (cutusermultiold == nullptr) {
        // indexed by atom type 1..ntypes, -1.0 marks a type without a user cutoff
        memory->create(cutusermultiold,ntypes+1,"comm:cutusermultiold");
        for (i=0; i < ntypes+1; ++i)
          cutusermultiold[i] = -1.0;
      }
      utils::bounds(FLERR,arg[iarg+1],1,ntypes,nlo,nhi,error);
      cut = utils::numeric(FLERR,arg[iarg+2],false,lmp);
      cutghostuser = MAX(cutghostuser,cut);
      if (cut < 0.0)
        error->all(FLERR,"Invalid cutoff {} in comm_modify command", arg[iarg+2]);
      for (i=nlo; i<=nhi; ++i)
        cutusermultiold[i] = cut;
      iarg += 3;

    } else if (strcmp(arg[iarg],"reduce/multi") == 0) {
      // flag only, takes no value; the remaining requirements are checked in init()
      if (mode == Comm::SINGLE)
        error->all(FLERR,"Use reduce/multi in mode multi only");
      multi_reduce = 1;
      iarg += 1;

    } else if (strcmp(arg[iarg],"vel") == 0) {
      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "comm_modify vel", error);
      ghost_velocity = utils::logical(FLERR,arg[iarg+1],false,lmp);
      iarg += 2;

    } else error->all(FLERR,"Unknown comm_modify keyword: {}", arg[iarg]);
  }
}
/* ----------------------------------------------------------------------
set dimensions for 3d grid of processors, and associated flags
invoked from input script by processors command
------------------------------------------------------------------------- */
void Comm::set_processors(int narg, char **arg)
{
if (narg < 3) error->all(FLERR,"Illegal processors command");
if (strcmp(arg[0],"*") == 0) user_procgrid[0] = 0;
else user_procgrid[0] = utils::inumeric(FLERR,arg[0],false,lmp);
if (strcmp(arg[1],"*") == 0) user_procgrid[1] = 0;
else user_procgrid[1] = utils::inumeric(FLERR,arg[1],false,lmp);
if (strcmp(arg[2],"*") == 0) user_procgrid[2] = 0;
else user_procgrid[2] = utils::inumeric(FLERR,arg[2],false,lmp);
if (user_procgrid[0] < 0 || user_procgrid[1] < 0 || user_procgrid[2] < 0)
error->all(FLERR,"Illegal processors command");
int p = user_procgrid[0]*user_procgrid[1]*user_procgrid[2];
if (p && p != nprocs)
error->all(FLERR,"Specified processors != physical processors");
int iarg = 3;
while (iarg < narg) {
if (strcmp(arg[iarg],"grid") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
if (strcmp(arg[iarg+1],"onelevel") == 0) {
gridflag = ONELEVEL;
} else if (strcmp(arg[iarg+1],"twolevel") == 0) {
if (iarg+6 > narg) error->all(FLERR,"Illegal processors command");
gridflag = TWOLEVEL;
ncores = utils::inumeric(FLERR,arg[iarg+2],false,lmp);
if (strcmp(arg[iarg+3],"*") == 0) user_coregrid[0] = 0;
else user_coregrid[0] = utils::inumeric(FLERR,arg[iarg+3],false,lmp);
if (strcmp(arg[iarg+4],"*") == 0) user_coregrid[1] = 0;
else user_coregrid[1] = utils::inumeric(FLERR,arg[iarg+4],false,lmp);
if (strcmp(arg[iarg+5],"*") == 0) user_coregrid[2] = 0;
else user_coregrid[2] = utils::inumeric(FLERR,arg[iarg+5],false,lmp);
if (ncores <= 0 || user_coregrid[0] < 0 ||
user_coregrid[1] < 0 || user_coregrid[2] < 0)
error->all(FLERR,"Illegal processors command");
iarg += 4;
} else if (strcmp(arg[iarg+1],"numa") == 0) {
gridflag = NUMA;
} else if (strcmp(arg[iarg+1],"custom") == 0) {
if (iarg+3 > narg) error->all(FLERR,"Illegal processors command");
gridflag = CUSTOM;
delete [] customfile;
customfile = utils::strdup(arg[iarg+2]);
iarg += 1;
} else error->all(FLERR,"Illegal processors command");
iarg += 2;
} else if (strcmp(arg[iarg],"map") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
if (strcmp(arg[iarg+1],"cart") == 0) mapflag = CART;
else if (strcmp(arg[iarg+1],"cart/reorder") == 0) mapflag = CARTREORDER;
else if (strcmp(arg[iarg+1],"xyz") == 0 ||
strcmp(arg[iarg+1],"xzy") == 0 ||
strcmp(arg[iarg+1],"yxz") == 0 ||
strcmp(arg[iarg+1],"yzx") == 0 ||
strcmp(arg[iarg+1],"zxy") == 0 ||
strcmp(arg[iarg+1],"zyx") == 0) {
mapflag = XYZ;
strncpy(xyz,arg[iarg+1],3);
} else error->all(FLERR,"Illegal processors command");
iarg += 2;
} else if (strcmp(arg[iarg],"part") == 0) {
if (iarg+4 > narg) error->all(FLERR,"Illegal processors command");
if (universe->nworlds == 1)
error->all(FLERR,
"Cannot use processors part command "
"without using partitions");
int isend = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
int irecv = utils::inumeric(FLERR,arg[iarg+2],false,lmp);
if (isend < 1 || isend > universe->nworlds ||
irecv < 1 || irecv > universe->nworlds || isend == irecv)
error->all(FLERR,"Invalid partitions in processors part command");
if (isend-1 == universe->iworld) {
if (send_to_partition >= 0)
error->all(FLERR,
"Sending partition in processors part command "
"is already a sender");
send_to_partition = irecv-1;
}
if (irecv-1 == universe->iworld) {
if (recv_from_partition >= 0)
error->all(FLERR,
"Receiving partition in processors part command "
"is already a receiver");
recv_from_partition = isend-1;
}
// only receiver has otherflag dependency
if (strcmp(arg[iarg+3],"multiple") == 0) {
if (universe->iworld == irecv-1) {
otherflag = 1;
other_style = Comm::MULTIPLE;
}
} else error->all(FLERR,"Illegal processors command");
iarg += 4;
} else if (strcmp(arg[iarg],"file") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
delete [] outfile;
outfile = utils::strdup(arg[iarg+1]);
iarg += 2;
} else error->all(FLERR,"Illegal processors command");
}
// error checks
if (gridflag == NUMA && mapflag != CART)
error->all(FLERR,"Processors grid numa and map style are incompatible");
if (otherflag && (gridflag == NUMA || gridflag == CUSTOM))
error->all(FLERR,
"Processors part option and grid style are incompatible");
}
/* ----------------------------------------------------------------------
create a 3d grid of procs based on Nprocs and box size & shape
map processors to grid, setup xyz split for a uniform grid
------------------------------------------------------------------------- */
// outflag = non-zero to have rank 0 print the resulting grid via utils::logmesg()
// on return procgrid, grid2proc, myloc, procneigh and xsplit/ysplit/zsplit are set
void Comm::set_proc_grid(int outflag)
{
// recv 3d proc grid of another partition if my 3d grid depends on it
// rank 0 receives procgrid first and coregrid second, matching the order
// of the two MPI_Send() calls at the end of this function, then
// broadcasts both to the other ranks of this partition
if (recv_from_partition >= 0) {
if (me == 0) {
MPI_Recv(other_procgrid,3,MPI_INT,
universe->root_proc[recv_from_partition],0,
universe->uworld,MPI_STATUS_IGNORE);
MPI_Recv(other_coregrid,3,MPI_INT,
universe->root_proc[recv_from_partition],0,
universe->uworld,MPI_STATUS_IGNORE);
}
MPI_Bcast(other_procgrid,3,MPI_INT,0,world);
MPI_Bcast(other_coregrid,3,MPI_INT,0,world);
}
// create ProcMap class to create 3d grid and map procs to it
// it is deleted again further down, once the grid has been printed/written
auto pmap = new ProcMap(lmp);
// create 3d grid of processors
// produces procgrid and coregrid (if relevant)
// which ProcMap method is used depends on gridflag (processors grid keyword)
if (gridflag == ONELEVEL) {
pmap->onelevel_grid(nprocs,user_procgrid,procgrid,
otherflag,other_style,other_procgrid,other_coregrid);
} else if (gridflag == TWOLEVEL) {
pmap->twolevel_grid(nprocs,user_procgrid,procgrid,
ncores,user_coregrid,coregrid,
otherflag,other_style,other_procgrid,other_coregrid);
} else if (gridflag == NUMA) {
pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
} else if (gridflag == CUSTOM) {
pmap->custom_grid(customfile,nprocs,user_procgrid,procgrid);
}
// error check on procgrid
// should not be necessary due to ProcMap
if (procgrid[0]*procgrid[1]*procgrid[2] != nprocs)
error->all(FLERR,"Bad grid of processors");
if (domain->dimension == 2 && procgrid[2] != 1)
error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
// grid2proc[i][j][k] = proc that owns i,j,k location in 3d grid
// a map from a previous call is discarded before allocating the new one
if (grid2proc) memory->destroy(grid2proc);
memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
"comm:grid2proc");
// map processor IDs to 3d processor grid
// produces myloc, procneigh, grid2proc
// the first argument of cart_map() is 0 for CART and 1 for CARTREORDER
if (gridflag == ONELEVEL) {
if (mapflag == CART)
pmap->cart_map(0,procgrid,myloc,procneigh,grid2proc);
else if (mapflag == CARTREORDER)
pmap->cart_map(1,procgrid,myloc,procneigh,grid2proc);
else if (mapflag == XYZ)
pmap->xyz_map(xyz,procgrid,myloc,procneigh,grid2proc);
} else if (gridflag == TWOLEVEL) {
if (mapflag == CART)
pmap->cart_map(0,procgrid,ncores,coregrid,myloc,procneigh,grid2proc);
else if (mapflag == CARTREORDER)
pmap->cart_map(1,procgrid,ncores,coregrid,myloc,procneigh,grid2proc);
else if (mapflag == XYZ)
pmap->xyz_map(xyz,procgrid,ncores,coregrid,myloc,procneigh,grid2proc);
} else if (gridflag == NUMA) {
pmap->numa_map(0,coregrid,myloc,procneigh,grid2proc);
} else if (gridflag == CUSTOM) {
pmap->custom_map(procgrid,myloc,procneigh,grid2proc);
}
// print 3d grid info to screen and logfile
// the core grid line is only added for the NUMA and TWOLEVEL grid styles
if (outflag && me == 0) {
auto mesg = fmt::format(" {} by {} by {} MPI processor grid\n",
procgrid[0],procgrid[1],procgrid[2]);
if (gridflag == NUMA || gridflag == TWOLEVEL)
mesg += fmt::format(" {} by {} by {} core grid within node\n",
coregrid[0],coregrid[1],coregrid[2]);
utils::logmesg(lmp,mesg);
}
// print 3d grid details to outfile
if (outfile) pmap->output(outfile,procgrid,grid2proc);
// free ProcMap class
delete pmap;
// set xsplit,ysplit,zsplit for uniform spacings
// each array has procgrid[dim]+1 entries running from 0.0 to 1.0
memory->destroy(xsplit);
memory->destroy(ysplit);
memory->destroy(zsplit);
memory->create(xsplit,procgrid[0]+1,"comm:xsplit");
memory->create(ysplit,procgrid[1]+1,"comm:ysplit");
memory->create(zsplit,procgrid[2]+1,"comm:zsplit");
for (int i = 0; i < procgrid[0]; i++) xsplit[i] = i * 1.0/procgrid[0];
for (int i = 0; i < procgrid[1]; i++) ysplit[i] = i * 1.0/procgrid[1];
for (int i = 0; i < procgrid[2]; i++) zsplit[i] = i * 1.0/procgrid[2];
// the last entry is assigned 1.0 directly instead of being computed
xsplit[procgrid[0]] = ysplit[procgrid[1]] = zsplit[procgrid[2]] = 1.0;
// set lamda box params after procs are assigned
// only set once unless load-balancing occurs
if (domain->triclinic) domain->set_lamda_box();
// send my 3d proc grid to another partition if requested
// only rank 0 sends: procgrid first, coregrid second
if (send_to_partition >= 0) {
if (me == 0) {
MPI_Send(procgrid,3,MPI_INT,
universe->root_proc[send_to_partition],0,
universe->uworld);
MPI_Send(coregrid,3,MPI_INT,
universe->root_proc[send_to_partition],0,
universe->uworld);
}
}
}
/* ----------------------------------------------------------------------
determine suitable communication cutoff.
this uses three inputs: 1) maximum neighborlist cutoff, 2) an estimate
based on bond lengths and bonded interaction styles present, and 3) a
user supplied communication cutoff.
the neighbor list cutoff (1) is *always* used, since it is a requirement
for neighborlists working correctly. the bond length based cutoff is
*only* used, if no pair style is defined and no user cutoff is provided.
otherwise, a warning is printed if the bond length based estimate is
larger than what is used.
print a warning, if a user specified communication cutoff is overridden.
------------------------------------------------------------------------- */
double Comm::get_comm_cutoff()
{
double maxcommcutoff, maxbondcutoff = 0.0;
if (force->bond) {
int n = atom->nbondtypes;
for (int i = 1; i <= n; ++i)
maxbondcutoff = MAX(maxbondcutoff,force->bond->equilibrium_distance(i));
// apply bond length based heuristics.
if (force->newton_bond) {
if (force->dihedral || force->improper) {
maxbondcutoff *= 2.25;
} else {
maxbondcutoff *=1.5;
}
} else {
if (force->dihedral || force->improper) {
maxbondcutoff *= 3.125;
} else if (force->angle) {
maxbondcutoff *= 2.25;
} else {
maxbondcutoff *=1.5;
}
}
maxbondcutoff += neighbor->skin;
}
// always take the larger of max neighbor list and user specified cutoff
maxcommcutoff = MAX(cutghostuser,neighbor->cutneighmax);
// use cutoff estimate from bond length only if no user specified
// cutoff was given and no pair style present. Otherwise print a
// warning, if the estimated bond based cutoff is larger than what
// is currently used.
if (!force->pair && (cutghostuser == 0.0)) {
maxcommcutoff = MAX(maxcommcutoff,maxbondcutoff);
} else {
if ((me == 0) && (maxbondcutoff > maxcommcutoff))
error->warning(FLERR,"Communication cutoff {} is shorter than a bond "
"length based estimate of {}. This may lead to errors.",
maxcommcutoff,maxbondcutoff);
}
// print warning if neighborlist cutoff overrides user cutoff
if ((me == 0) && (update->setupflag == 1)) {
if ((cutghostuser > 0.0) && (maxcommcutoff > cutghostuser))
error->warning(FLERR,"Communication cutoff adjusted to {}",maxcommcutoff);
}
// check maximum interval size for neighbor multi
if (neighbor->interval_collection_flag) {
for (int i = 0; i < neighbor->ncollections; i++){
maxcommcutoff = MAX(maxcommcutoff, neighbor->collection2cut[i]);
}
}
return maxcommcutoff;
}
/* ----------------------------------------------------------------------
determine which proc owns atom with coord x[3] based on current decomp
x will be in box (orthogonal) or lamda coords (triclinic)
if layout = UNIFORM, calculate owning proc directly
if layout = NONUNIFORM, iteratively find owning proc via binary search
if layout = TILED, CommTiled has its own method
return owning proc ID via grid2proc
return igx,igy,igz = logical grid loc of owning proc within 3d grid of procs
------------------------------------------------------------------------- */
int Comm::coord2proc(double *x, int &igx, int &igy, int &igz)
{
double *prd = domain->prd;
double *boxlo = domain->boxlo;
// initialize triclinic b/c coord2proc can be called before Comm::init()
// via Irregular::migrate_atoms()
triclinic = domain->triclinic;
if (layout == Comm::LAYOUT_UNIFORM) {
if (triclinic == 0) {
igx = static_cast<int> (procgrid[0] * (x[0]-boxlo[0]) / prd[0]);
igy = static_cast<int> (procgrid[1] * (x[1]-boxlo[1]) / prd[1]);
igz = static_cast<int> (procgrid[2] * (x[2]-boxlo[2]) / prd[2]);
} else {
igx = static_cast<int> (procgrid[0] * x[0]);
igy = static_cast<int> (procgrid[1] * x[1]);
igz = static_cast<int> (procgrid[2] * x[2]);
}
} else if (layout == Comm::LAYOUT_NONUNIFORM) {
if (triclinic == 0) {
igx = utils::binary_search((x[0]-boxlo[0])/prd[0],procgrid[0],xsplit);
igy = utils::binary_search((x[1]-boxlo[1])/prd[1],procgrid[1],ysplit);
igz = utils::binary_search((x[2]-boxlo[2])/prd[2],procgrid[2],zsplit);
} else {
igx = utils::binary_search(x[0],procgrid[0],xsplit);
igy = utils::binary_search(x[1],procgrid[1],ysplit);
igz = utils::binary_search(x[2],procgrid[2],zsplit);
}
}
if (igx < 0) igx = 0;
if (igx >= procgrid[0]) igx = procgrid[0] - 1;
if (igy < 0) igy = 0;
if (igy >= procgrid[1]) igy = procgrid[1] - 1;
if (igz < 0) igz = 0;
if (igz >= procgrid[2]) igz = procgrid[2] - 1;
return grid2proc[igx][igy][igz];
}
/* ----------------------------------------------------------------------
partition a global regular grid into one brick-shaped sub-grid per proc
if grid point is inside my sub-domain I own it,
this includes sub-domain lo boundary but excludes hi boundary
nx,ny,nz = extent of global grid
indices into the global grid range from 0 to N-1 in each dim
zfactor = 0.0 if the grid exactly covers the simulation box
zfactor > 1.0 if the grid extends beyond the +z boundary by this factor
used by 2d slab-mode PPPM
this effectively maps proc sub-grids to a smaller subset of the grid
nxyz lo/hi = inclusive lo/hi bounds of global grid sub-brick I own
if proc owns no grid cells in a dim, then nlo > nhi
special case: 2 procs share boundary which a grid point is exactly on
2 equality if tests ensure a consistent decision as to which proc owns it
------------------------------------------------------------------------- */
void Comm::partition_grid(int nx, int ny, int nz, double zfactor,
                          int &nxlo, int &nxhi, int &nylo, int &nyhi,
                          int &nzlo, int &nzhi)
{
  // fractional bounds (0.0 to 1.0) of my sub-domain in each dimension
  // taken from the split arrays for brick layouts, from mysplit for tiled

  double xfraclo,xfrachi,yfraclo,yfrachi,zfraclo,zfrachi;

  if (layout != LAYOUT_TILED) {
    xfraclo = xsplit[myloc[0]];
    xfrachi = xsplit[myloc[0]+1];
    yfraclo = ysplit[myloc[1]];
    yfrachi = ysplit[myloc[1]+1];
    zfraclo = zsplit[myloc[2]];
    zfrachi = zsplit[myloc[2]+1];
  } else {
    xfraclo = mysplit[0][0];
    xfrachi = mysplit[0][1];
    yfraclo = mysplit[1][0];
    yfrachi = mysplit[1][1];
    zfraclo = mysplit[2][0];
    zfrachi = mysplit[2][1];
  }

  // lo index = smallest grid index >= lo bound (lo boundary is included)
  // hi index = largest grid index < hi bound (hi boundary is excluded)
  // the cast truncates, so lo is bumped up unless the bound falls exactly
  // on a grid point and hi is bumped down only if it does

  nxlo = static_cast<int> (xfraclo * nx);
  if (1.0*nxlo != xfraclo*nx) nxlo++;
  nxhi = static_cast<int> (xfrachi * nx);
  if (1.0*nxhi == xfrachi*nx) nxhi--;

  nylo = static_cast<int> (yfraclo * ny);
  if (1.0*nylo != yfraclo*ny) nylo++;
  nyhi = static_cast<int> (yfrachi * ny);
  if (1.0*nyhi == yfrachi*ny) nyhi--;

  if (zfactor == 0.0) {
    nzlo = static_cast<int> (zfraclo * nz);
    if (1.0*nzlo != zfraclo*nz) nzlo++;
    nzhi = static_cast<int> (zfrachi * nz);
    if (1.0*nzhi == zfrachi*nz) nzhi--;
  } else {
    // the sub-domain bounds map onto the grid scaled down by zfactor
    // the on-a-grid-point tests must use the same scaled position that
    // was truncated, otherwise the boundary rule above is not applied
    const double zposlo = zfraclo * nz/zfactor;
    const double zposhi = zfrachi * nz/zfactor;
    nzlo = static_cast<int> (zposlo);
    if (1.0*nzlo != zposlo) nzlo++;
    nzhi = static_cast<int> (zposhi);
    if (1.0*nzhi == zposhi) nzhi--;
  }

  // OLD code
  // could sometimes map grid points slightly outside a proc to the proc

  /*
  if (layout != LAYOUT_TILED) {
    nxlo = static_cast<int> (xsplit[myloc[0]] * nx);
    nxhi = static_cast<int> (xsplit[myloc[0]+1] * nx) - 1;
    nylo = static_cast<int> (ysplit[myloc[1]] * ny);
    nyhi = static_cast<int> (ysplit[myloc[1]+1] * ny) - 1;
    if (zfactor == 0.0) {
      nzlo = static_cast<int> (zsplit[myloc[2]] * nz);
      nzhi = static_cast<int> (zsplit[myloc[2]+1] * nz) - 1;
    } else {
      nzlo = static_cast<int> (zsplit[myloc[2]] * nz/zfactor);
      nzhi = static_cast<int> (zsplit[myloc[2]+1] * nz/zfactor) - 1;
    }
  } else {
    nxlo = static_cast<int> (mysplit[0][0] * nx);
    nxhi = static_cast<int> (mysplit[0][1] * nx) - 1;
    nylo = static_cast<int> (mysplit[1][0] * ny);
    nyhi = static_cast<int> (mysplit[1][1] * ny) - 1;
    if (zfactor == 0.0) {
      nzlo = static_cast<int> (mysplit[2][0] * nz);
      nzhi = static_cast<int> (mysplit[2][1] * nz) - 1;
    } else {
      nzlo = static_cast<int> (mysplit[2][0] * nz/zfactor);
      nzhi = static_cast<int> (mysplit[2][1] * nz/zfactor) - 1;
    }
  }
  */
}
/* ----------------------------------------------------------------------
communicate inbuf around full ring of processors with messtag
nbytes = size of inbuf = n datums * nper bytes
callback() is invoked to allow caller to process/update each proc's inbuf
if self=1 (default), then callback() is invoked on final iteration
using original inbuf, which may have been updated
for non-nullptr outbuf, final updated inbuf is copied to it
ok to specify outbuf = inbuf
the ptr argument is a pointer to the instance of calling class
------------------------------------------------------------------------- */
void Comm::ring(int n, int nper, void *inbuf, int messtag,
                void (*callback)(int, char *, void *),
                void *outbuf, void *ptr, int self)
{
  // all procs agree on the largest message that will travel the ring

  int nbytes = n*nper;
  int maxbytes;
  MPI_Allreduce(&nbytes,&maxbytes,1,MPI_INT,MPI_MAX,world);

  // nothing to do if no proc contributes any data

  if (maxbytes == 0) return;

  // a proc claiming to have data must also provide a buffer

  if ((inbuf == nullptr) && (nbytes > 0))
    error->one(FLERR,"Cannot put data on ring from NULL pointer");

  // work = message currently held by this proc
  // incoming = landing area for the message arriving from the previous proc

  char *work,*incoming;
  memory->create(work,maxbytes,"comm:buf");
  memory->create(incoming,maxbytes,"comm:bufcopy");
  if (nbytes && inbuf) memcpy(work,inbuf,nbytes);

  // ring neighbors with wrap-around; on a single proc next equals me

  const int next = (me+1 == nprocs) ? 0 : me+1;
  const int prev = (me-1 < 0) ? nprocs-1 : me-1;
  const bool have_neighbor = (me != next);
  const int last_iter = nprocs-1;

  for (int iter = 0; iter < nprocs; iter++) {
    if (have_neighbor) {
      MPI_Request request;
      MPI_Status status;

      // pass my current message on and pick up the one from behind me;
      // its actual length becomes the new nbytes

      MPI_Irecv(incoming,maxbytes,MPI_CHAR,prev,messtag,world,&request);
      MPI_Send(work,nbytes,MPI_CHAR,next,messtag,world);
      MPI_Wait(&request,&status);
      MPI_Get_count(&status,MPI_CHAR,&nbytes);
      if (nbytes) memcpy(work,incoming,nbytes);
    }

    // the final iteration is only passed to the callback when self is set

    const bool skip_callback = (!self && (iter == last_iter));
    if (!skip_callback) callback(nbytes/nper,work,ptr);
  }

  // hand the final buffer content back to the caller if requested

  if (nbytes && outbuf) memcpy(outbuf,work,nbytes);

  memory->destroy(incoming);
  memory->destroy(work);
}
/* ----------------------------------------------------------------------
rendezvous communication operation
three stages:
first comm sends inbuf from caller decomp to rvous decomp
callback operates on data in rendezvous decomp
second comm sends outbuf from rvous decomp back to caller decomp
inputs:
which = perform (0) irregular or (1) MPI_Alltoallv communication
n = # of datums in inbuf
inbuf = vector of input datums
insize = byte size of each input datum
inorder = 0 for inbuf in random proc order, 1 for datums ordered by proc
procs: inorder 0 = proc to send each datum to, 1 = # of datums/proc,
callback = caller function to invoke in rendezvous decomposition
takes input datums, returns output datums
outorder = same as inorder, but for datums returned by callback()
ptr = pointer to caller class, passed to callback()
outputs:
nout = # of output datums (function return)
outbuf = vector of output datums
outsize = byte size of each output datum
callback inputs:
nrvous = # of rvous decomp datums in inbuf_rvous
inbuf_rvous = vector of rvous decomp input datums
ptr = pointer to caller class
callback outputs:
nrvous_out = # of rvous decomp output datums (function return)
flag = 0 for no second comm, 1 for outbuf_rvous = inbuf_rvous,
2 for second comm with new outbuf_rvous
procs_rvous = outorder 0 = proc to send each datum to, 1 = # of datums/proc
allocated
outbuf_rvous = vector of rvous decomp output datums
NOTE: could use MPI_INT or MPI_DOUBLE instead of MPI_CHAR
to avoid checked-for overflow in MPI_Alltoallv?
------------------------------------------------------------------------- */
int Comm::
rendezvous(int which, int n, char *inbuf, int insize,
           int inorder, int *procs,
           int (*callback)(int, char *, int &, int *&, char *&, void *),
           int outorder, char *&outbuf, int outsize, void *ptr, int statflag)
{
  // any non-zero value of which selects the all2all variant

  if (which != 0)
    return rendezvous_all2all(n,inbuf,insize,inorder,procs,callback,
                              outorder,outbuf,outsize,ptr,statflag);

  // which = 0 selects the irregular variant

  return rendezvous_irregular(n,inbuf,insize,inorder,procs,callback,
                              outorder,outbuf,outsize,ptr,statflag);
}
/* ---------------------------------------------------------------------- */
int Comm::
rendezvous_irregular(int n, char *inbuf, int insize, int inorder, int *procs,
int (*callback)(int, char *, int &, int *&, char *&, void *),
int outorder, char *&outbuf,
int outsize, void *ptr, int statflag)
{
// irregular comm of inbuf from caller decomp to rendezvous decomp
auto irregular = new Irregular(lmp);
int nrvous;
if (inorder) nrvous = irregular->create_data_grouped(n,procs);
else nrvous = irregular->create_data(n,procs);
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
auto inbuf_rvous = (char *) memory->smalloc((bigint) nrvous*insize+1, "rendezvous:inbuf");
irregular->exchange_data(inbuf,insize,inbuf_rvous);
bigint irregular1_bytes = irregular->memory_usage();
irregular->destroy_data();
delete irregular;
// peform rendezvous computation via callback()
// callback() allocates/populates proclist_rvous and outbuf_rvous
int flag;
int *procs_rvous;
char *outbuf_rvous;
int nrvous_out = callback(nrvous,inbuf_rvous,flag, procs_rvous,outbuf_rvous,ptr);
if (flag != 1) memory->sfree(inbuf_rvous); // outbuf_rvous = inbuf_vous
if (flag == 0) {
if (statflag) rendezvous_stats(n,0,nrvous,nrvous_out,insize,outsize,
(bigint) nrvous_out*sizeof(int) + irregular1_bytes);
return 0; // all nout_rvous are 0, no 2nd comm stage
}
// irregular comm of outbuf from rendezvous decomp back to caller decomp
// caller will free outbuf
irregular = new Irregular(lmp);
int nout;
if (outorder) nout = irregular->create_data_grouped(nrvous_out,procs_rvous);
else nout = irregular->create_data(nrvous_out,procs_rvous);
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
outbuf = (char *) memory->smalloc((bigint) nout*outsize+1, "rendezvous:outbuf");
irregular->exchange_data(outbuf_rvous,outsize,outbuf);
bigint irregular2_bytes = irregular->memory_usage();
irregular->destroy_data();
delete irregular;
memory->destroy(procs_rvous);
memory->sfree(outbuf_rvous);
// return number of output datums
// last arg to stats() = memory for procs_rvous + irregular comm
if (statflag) rendezvous_stats(n,nout,nrvous,nrvous_out,insize,outsize,
(bigint) nrvous_out*sizeof(int) +
MAX(irregular1_bytes,irregular2_bytes));
return nout;
}
/* ---------------------------------------------------------------------- */
int Comm::
rendezvous_all2all(int n, char *inbuf, int insize, int inorder, int *procs,
int (*callback)(int, char *, int &, int *&, char *&, void *),
int outorder, char *&outbuf, int outsize, void *ptr,
int statflag)
{
int iproc;
bigint all2all1_bytes,all2all2_bytes;
int *sendcount,*sdispls,*recvcount,*rdispls;
int *procs_a2a;
bigint *offsets;
char *inbuf_a2a,*outbuf_a2a;
// create procs and inbuf for All2all if necessary
if (!inorder) {
memory->create(procs_a2a,nprocs,"rendezvous:procs");
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
inbuf_a2a = (char *) memory->smalloc((bigint) n*insize+1,
"rendezvous:inbuf");
memset(inbuf_a2a,0,(bigint)n*insize*sizeof(char));
memory->create(offsets,nprocs,"rendezvous:offsets");
for (int i = 0; i < nprocs; i++) procs_a2a[i] = 0;
for (int i = 0; i < n; i++) procs_a2a[procs[i]]++;
offsets[0] = 0;
for (int i = 1; i < nprocs; i++)
offsets[i] = offsets[i-1] + (bigint)insize*procs_a2a[i-1];
bigint offset = 0;
for (int i = 0; i < n; i++) {
iproc = procs[i];
memcpy(&inbuf_a2a[offsets[iproc]],&inbuf[offset],insize);
offsets[iproc] += insize;
offset += insize;
}
all2all1_bytes = nprocs*sizeof(int) + nprocs*sizeof(bigint)
+ (bigint)n*insize;
} else {
procs_a2a = procs;
inbuf_a2a = inbuf;
all2all1_bytes = 0;
}
// create args for MPI_Alltoallv() on input data
memory->create(sendcount,nprocs,"rendezvous:sendcount");
memcpy(sendcount,procs_a2a,nprocs*sizeof(int));
memory->create(recvcount,nprocs,"rendezvous:recvcount");
MPI_Alltoall(sendcount,1,MPI_INT,recvcount,1,MPI_INT,world);
memory->create(sdispls,nprocs,"rendezvous:sdispls");
memory->create(rdispls,nprocs,"rendezvous:rdispls");
sdispls[0] = rdispls[0] = 0;
for (int i = 1; i < nprocs; i++) {
sdispls[i] = sdispls[i-1] + sendcount[i-1];
rdispls[i] = rdispls[i-1] + recvcount[i-1];
}
int nrvous = rdispls[nprocs-1] + recvcount[nprocs-1];
// test for overflow of input data due to imbalance or insize
// means that individual sdispls or rdispls values overflow
int overflow = 0;
if ((bigint) n*insize > MAXSMALLINT) overflow = 1;
if ((bigint) nrvous*insize > MAXSMALLINT) overflow = 1;
int overflowall;
MPI_Allreduce(&overflow,&overflowall,1,MPI_INT,MPI_MAX,world);
if (overflowall) error->all(FLERR,"Overflow input size in rendezvous_a2a");
for (int i = 0; i < nprocs; i++) {
sendcount[i] *= insize;
sdispls[i] *= insize;
recvcount[i] *= insize;
rdispls[i] *= insize;
}
// all2all comm of inbuf from caller decomp to rendezvous decomp
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
auto inbuf_rvous = (char *) memory->smalloc((bigint) nrvous*insize+1, "rendezvous:inbuf");
memset(inbuf_rvous,0,(bigint) nrvous*insize*sizeof(char));
MPI_Alltoallv(inbuf_a2a,sendcount,sdispls,MPI_CHAR,
inbuf_rvous,recvcount,rdispls,MPI_CHAR,world);
if (!inorder) {
memory->destroy(procs_a2a);
memory->sfree(inbuf_a2a);
memory->destroy(offsets);
}
// peform rendezvous computation via callback()
// callback() allocates/populates proclist_rvous and outbuf_rvous
int flag;
int *procs_rvous;
char *outbuf_rvous;
int nrvous_out = callback(nrvous,inbuf_rvous,flag, procs_rvous,outbuf_rvous,ptr);
if (flag != 1) memory->sfree(inbuf_rvous); // outbuf_rvous = inbuf_vous
if (flag == 0) {
memory->destroy(sendcount);
memory->destroy(recvcount);
memory->destroy(sdispls);
memory->destroy(rdispls);
if (statflag) rendezvous_stats(n,0,nrvous,nrvous_out,insize,outsize,
(bigint) nrvous_out*sizeof(int) +
4*nprocs*sizeof(int) + all2all1_bytes);
return 0; // all nout_rvous are 0, no 2nd irregular
}
// create procs and outbuf for All2all if necessary
if (!outorder) {
memory->create(procs_a2a,nprocs,"rendezvous_a2a:procs");
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
outbuf_a2a = (char *) memory->smalloc((bigint) nrvous_out*outsize+1, "rendezvous:outbuf");
memory->create(offsets,nprocs,"rendezvous:offsets");
for (int i = 0; i < nprocs; i++) procs_a2a[i] = 0;
for (int i = 0; i < nrvous_out; i++) procs_a2a[procs_rvous[i]]++;
offsets[0] = 0;
for (int i = 1; i < nprocs; i++)
offsets[i] = offsets[i-1] + (bigint)outsize*procs_a2a[i-1];
bigint offset = 0;
for (int i = 0; i < nrvous_out; i++) {
iproc = procs_rvous[i];
memcpy(&outbuf_a2a[offsets[iproc]],&outbuf_rvous[offset],outsize);
offsets[iproc] += outsize;
offset += outsize;
}
all2all2_bytes = nprocs*sizeof(int) + nprocs*sizeof(bigint) + (bigint)nrvous_out*outsize;
} else {
procs_a2a = procs_rvous;
outbuf_a2a = outbuf_rvous;
all2all2_bytes = 0;
}
// comm outbuf from rendezvous decomposition back to caller
memcpy(sendcount,procs_a2a,nprocs*sizeof(int));
MPI_Alltoall(sendcount,1,MPI_INT,recvcount,1,MPI_INT,world);
sdispls[0] = rdispls[0] = 0;
for (int i = 1; i < nprocs; i++) {
sdispls[i] = sdispls[i-1] + sendcount[i-1];
rdispls[i] = rdispls[i-1] + recvcount[i-1];
}
int nout = rdispls[nprocs-1] + recvcount[nprocs-1];
// test for overflow of outbuf due to imbalance or outsize
// means that individual sdispls or rdispls values overflow
overflow = 0;
if ((bigint) nrvous*outsize > MAXSMALLINT) overflow = 1;
if ((bigint) nout*outsize > MAXSMALLINT) overflow = 1;
MPI_Allreduce(&overflow,&overflowall,1,MPI_INT,MPI_MAX,world);
if (overflowall) error->all(FLERR,"Overflow output in rendezvous_a2a");
for (int i = 0; i < nprocs; i++) {
sendcount[i] *= outsize;
sdispls[i] *= outsize;
recvcount[i] *= outsize;
rdispls[i] *= outsize;
}
// all2all comm of outbuf from rendezvous decomp back to caller decomp
// caller will free outbuf
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
outbuf = (char *) memory->smalloc((bigint) nout*outsize+1,"rendezvous:outbuf");
MPI_Alltoallv(outbuf_a2a,sendcount,sdispls,MPI_CHAR,
outbuf,recvcount,rdispls,MPI_CHAR,world);
memory->destroy(procs_rvous);
memory->sfree(outbuf_rvous);
if (!outorder) {
memory->destroy(procs_a2a);
memory->sfree(outbuf_a2a);
memory->destroy(offsets);
}
// clean up
memory->destroy(sendcount);
memory->destroy(recvcount);
memory->destroy(sdispls);
memory->destroy(rdispls);
// return number of output datums
// last arg to stats() = mem for procs_rvous + per-proc vecs + reordering ops
if (statflag) rendezvous_stats(n,nout,nrvous,nrvous_out,insize,outsize,
(bigint) nrvous_out*sizeof(int) +
4*nprocs*sizeof(int) +
MAX(all2all1_bytes,all2all2_bytes));
return nout;
}
/* ----------------------------------------------------------------------
print balance and memory info for rendezvous operation
useful for debugging
------------------------------------------------------------------------- */
void Comm::rendezvous_stats(int n, int nout, int nrvous, int nrvous_out,
                            int insize, int outsize, bigint commsize)
{
  // n, nout = # of input and output datums in the caller decomposition
  // nrvous, nrvous_out = # of input and output datums in the rendezvous decomposition
  // insize, outsize = byte size of one input and one output datum
  // commsize = per-proc memory in bytes attributed to the communication
  // every proc takes part in the reductions, only proc 0 prints

  bigint size_in_all,size_in_max,size_in_min;
  bigint size_out_all,size_out_max,size_out_min;
  bigint size_inrvous_all,size_inrvous_max,size_inrvous_min;
  bigint size_outrvous_all,size_outrvous_max,size_outrvous_min;
  bigint size_comm_all,size_comm_max,size_comm_min;

  // input data in caller decomposition

  bigint size = (bigint) n*insize;
  MPI_Allreduce(&size,&size_in_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
  MPI_Allreduce(&size,&size_in_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
  MPI_Allreduce(&size,&size_in_min,1,MPI_LMP_BIGINT,MPI_MIN,world);

  // output data in caller decomposition

  size = (bigint) nout*outsize;
  MPI_Allreduce(&size,&size_out_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
  MPI_Allreduce(&size,&size_out_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
  MPI_Allreduce(&size,&size_out_min,1,MPI_LMP_BIGINT,MPI_MIN,world);

  // input data in rendezvous decomposition

  size = (bigint) nrvous*insize;
  MPI_Allreduce(&size,&size_inrvous_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
  MPI_Allreduce(&size,&size_inrvous_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
  MPI_Allreduce(&size,&size_inrvous_min,1,MPI_LMP_BIGINT,MPI_MIN,world);

  // output data in rendezvous decomposition
  // must be scaled by outsize, since the datum counts printed below
  // are recovered by dividing these sums by outsize

  size = (bigint) nrvous_out*outsize;
  MPI_Allreduce(&size,&size_outrvous_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
  MPI_Allreduce(&size,&size_outrvous_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
  MPI_Allreduce(&size,&size_outrvous_min,1,MPI_LMP_BIGINT,MPI_MIN,world);

  // memory attributed to the communication

  size = commsize;
  MPI_Allreduce(&size,&size_comm_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
  MPI_Allreduce(&size,&size_comm_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
  MPI_Allreduce(&size,&size_comm_min,1,MPI_LMP_BIGINT,MPI_MIN,world);

  int mbytes = 1024*1024;

  if (me == 0) {
    std::string mesg = "Rendezvous balance and memory info: (tot,ave,max,min) \n";
    mesg += fmt::format(" input datum count: {} {} {} {}\n",
                        size_in_all/insize,1.0*size_in_all/nprocs/insize,
                        size_in_max/insize,size_in_min/insize);
    mesg += fmt::format(" input data (MB): {:.6} {:.6} {:.6} {:.6}\n",
                        1.0*size_in_all/mbytes,1.0*size_in_all/nprocs/mbytes,
                        1.0*size_in_max/mbytes,1.0*size_in_min/mbytes);

    // outsize = 0 would divide by zero, so print zero counts instead

    if (outsize)
      mesg += fmt::format(" output datum count: {} {} {} {}\n",
                          size_out_all/outsize,1.0*size_out_all/nprocs/outsize,
                          size_out_max/outsize,size_out_min/outsize);
    else
      mesg += fmt::format(" output datum count: {} {:.6} {} {}\n",0,0.0,0,0);
    mesg += fmt::format(" output data (MB): {:.6} {:.6} {:.6} {:.6}\n",
                        1.0*size_out_all/mbytes,1.0*size_out_all/nprocs/mbytes,
                        1.0*size_out_max/mbytes,1.0*size_out_min/mbytes);
    mesg += fmt::format(" input rvous datum count: {} {} {} {}\n",
                        size_inrvous_all/insize,1.0*size_inrvous_all/nprocs/insize,
                        size_inrvous_max/insize,size_inrvous_min/insize);
    mesg += fmt::format(" input rvous data (MB): {:.6} {:.6} {:.6} {:.6}\n",
                        1.0*size_inrvous_all/mbytes,1.0*size_inrvous_all/nprocs/mbytes,
                        1.0*size_inrvous_max/mbytes,1.0*size_inrvous_min/mbytes);
    if (outsize)
      mesg += fmt::format(" output rvous datum count: {} {} {} {}\n",
                          size_outrvous_all/outsize,1.0*size_outrvous_all/nprocs/outsize,
                          size_outrvous_max/outsize,size_outrvous_min/outsize);
    else
      mesg += fmt::format(" output rvous datum count: {} {:.6} {} {}\n",0,0.0,0,0);
    mesg += fmt::format(" output rvous data (MB): {:.6} {:.6} {:.6} {:.6}\n",
                        1.0*size_outrvous_all/mbytes,1.0*size_outrvous_all/nprocs/mbytes,
                        1.0*size_outrvous_max/mbytes,1.0*size_outrvous_min/mbytes);
    mesg += fmt::format(" rvous comm (MB): {:.6} {:.6} {:.6} {:.6}\n",
                        1.0*size_comm_all/mbytes,1.0*size_comm_all/nprocs/mbytes,
                        1.0*size_comm_max/mbytes,1.0*size_comm_min/mbytes);
    utils::logmesg(lmp,mesg);
  }
}