1372 lines
51 KiB
C++
1372 lines
51 KiB
C++
// clang-format off
|
|
/* ----------------------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
https://www.lammps.org/, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the GNU General Public License.
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
------------------------------------------------------------------------- */
|
|
|
|
#include "comm.h"
|
|
|
|
#include "accelerator_kokkos.h"
|
|
#include "atom.h" // IWYU pragma: keep
|
|
#include "atom_vec.h"
|
|
#include "bond.h"
|
|
#include "compute.h"
|
|
#include "domain.h" // IWYU pragma: keep
|
|
#include "dump.h"
|
|
#include "error.h"
|
|
#include "fix.h"
|
|
#include "force.h"
|
|
#include "group.h"
|
|
#include "irregular.h"
|
|
#include "memory.h" // IWYU pragma: keep
|
|
#include "modify.h"
|
|
#include "neighbor.h" // IWYU pragma: keep
|
|
#include "output.h"
|
|
#include "pair.h"
|
|
#include "procmap.h"
|
|
#include "universe.h"
|
|
#include "update.h"
|
|
|
|
#include <cstring>
|
|
#ifdef _OPENMP
|
|
#include <omp.h>
|
|
#endif
|
|
|
|
using namespace LAMMPS_NS;
|
|
|
|
#define BUFEXTRA 1024
|
|
|
|
enum{ONELEVEL,TWOLEVEL,NUMA,CUSTOM};
|
|
enum{CART,CARTREORDER,XYZ};
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
Comm::Comm(LAMMPS *lmp) : Pointers(lmp)
|
|
{
|
|
MPI_Comm_rank(world,&me);
|
|
MPI_Comm_size(world,&nprocs);
|
|
|
|
mode = 0;
|
|
bordergroup = 0;
|
|
cutghostuser = 0.0;
|
|
cutusermulti = nullptr;
|
|
cutusermultiold = nullptr;
|
|
ncollections = 0;
|
|
ncollections_cutoff = 0;
|
|
ghost_velocity = 0;
|
|
|
|
user_procgrid[0] = user_procgrid[1] = user_procgrid[2] = 0;
|
|
coregrid[0] = coregrid[1] = coregrid[2] = 1;
|
|
gridflag = ONELEVEL;
|
|
mapflag = CART;
|
|
customfile = nullptr;
|
|
outfile = nullptr;
|
|
recv_from_partition = send_to_partition = -1;
|
|
otherflag = 0;
|
|
|
|
maxexchange = maxexchange_atom = maxexchange_fix = 0;
|
|
maxexchange_fix_dynamic = 0;
|
|
bufextra = BUFEXTRA;
|
|
|
|
grid2proc = nullptr;
|
|
xsplit = ysplit = zsplit = nullptr;
|
|
rcbnew = 0;
|
|
multi_reduce = 0;
|
|
|
|
// use of OpenMP threads
|
|
// query OpenMP for number of threads/process set by user at run-time
|
|
// if the OMP_NUM_THREADS environment variable is not set, we default
|
|
// to using 1 thread. This follows the principle of the least surprise,
|
|
// while practically all OpenMP implementations violate it by using
|
|
// as many threads as there are (virtual) CPU cores by default.
|
|
|
|
nthreads = 1;
|
|
#ifdef _OPENMP
|
|
if (lmp->kokkos) {
|
|
nthreads = lmp->kokkos->nthreads * lmp->kokkos->numa;
|
|
} else if (getenv("OMP_NUM_THREADS") == nullptr) {
|
|
nthreads = 1;
|
|
if (me == 0)
|
|
error->message(FLERR,"OMP_NUM_THREADS environment is not set. "
|
|
"Defaulting to 1 thread.");
|
|
} else {
|
|
nthreads = omp_get_max_threads();
|
|
}
|
|
|
|
// enforce consistent number of threads across all MPI tasks
|
|
|
|
MPI_Bcast(&nthreads,1,MPI_INT,0,world);
|
|
if (!lmp->kokkos) omp_set_num_threads(nthreads);
|
|
|
|
if (me == 0)
|
|
utils::logmesg(lmp," using {} OpenMP thread(s) per MPI task\n",nthreads);
|
|
#endif
|
|
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
Comm::~Comm()
|
|
{
|
|
memory->destroy(grid2proc);
|
|
memory->destroy(xsplit);
|
|
memory->destroy(ysplit);
|
|
memory->destroy(zsplit);
|
|
memory->destroy(cutusermulti);
|
|
memory->destroy(cutusermultiold);
|
|
delete [] customfile;
|
|
delete [] outfile;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
deep copy of arrays from old Comm class to new one
|
|
all public/protected vectors/arrays in parent Comm class must be copied
|
|
called from alternate constructor of child classes
|
|
when new comm style is created from Input
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::copy_arrays(Comm *oldcomm)
|
|
{
|
|
if (oldcomm->grid2proc) {
|
|
memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
|
|
"comm:grid2proc");
|
|
memcpy(&grid2proc[0][0][0],&oldcomm->grid2proc[0][0][0],
|
|
(procgrid[0]*procgrid[1]*procgrid[2])*sizeof(int));
|
|
|
|
memory->create(xsplit,procgrid[0]+1,"comm:xsplit");
|
|
memory->create(ysplit,procgrid[1]+1,"comm:ysplit");
|
|
memory->create(zsplit,procgrid[2]+1,"comm:zsplit");
|
|
memcpy(xsplit,oldcomm->xsplit,(procgrid[0]+1)*sizeof(double));
|
|
memcpy(ysplit,oldcomm->ysplit,(procgrid[1]+1)*sizeof(double));
|
|
memcpy(zsplit,oldcomm->zsplit,(procgrid[2]+1)*sizeof(double));
|
|
}
|
|
|
|
ncollections = oldcomm->ncollections;
|
|
ncollections_cutoff = oldcomm->ncollections_cutoff;
|
|
if (oldcomm->cutusermulti) {
|
|
memory->create(cutusermulti,ncollections_cutoff,"comm:cutusermulti");
|
|
memcpy(cutusermulti,oldcomm->cutusermulti,ncollections_cutoff);
|
|
}
|
|
|
|
if (oldcomm->cutusermultiold) {
|
|
memory->create(cutusermultiold,atom->ntypes+1,"comm:cutusermultiold");
|
|
memcpy(cutusermultiold,oldcomm->cutusermultiold,atom->ntypes+1);
|
|
}
|
|
|
|
if (customfile)
|
|
customfile = utils::strdup(oldcomm->customfile);
|
|
|
|
if (outfile)
|
|
outfile = utils::strdup(oldcomm->outfile);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
common to all Comm styles
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::init()
|
|
{
|
|
triclinic = domain->triclinic;
|
|
map_style = atom->map_style;
|
|
|
|
// check warn if any proc's subbox is smaller than neigh skin
|
|
// since may lead to lost atoms in exchange()
|
|
// really should check every exchange() in case box size is shrinking
|
|
// but seems overkill to do that (fix balance does perform this check)
|
|
|
|
domain->subbox_too_small_check(neighbor->skin);
|
|
|
|
// comm_only = 1 if only x,f are exchanged in forward/reverse comm
|
|
// comm_x_only = 0 if ghost_velocity since velocities are added
|
|
|
|
comm_x_only = atom->avec->comm_x_only;
|
|
comm_f_only = atom->avec->comm_f_only;
|
|
if (ghost_velocity) comm_x_only = 0;
|
|
|
|
// set per-atom sizes for forward/reverse/border comm
|
|
// augment by velocity and fix quantities if needed
|
|
|
|
size_forward = atom->avec->size_forward;
|
|
size_reverse = atom->avec->size_reverse;
|
|
size_border = atom->avec->size_border;
|
|
|
|
if (ghost_velocity) size_forward += atom->avec->size_velocity;
|
|
if (ghost_velocity) size_border += atom->avec->size_velocity;
|
|
|
|
const auto &fix_list = modify->get_fix_list();
|
|
for (const auto &fix : fix_list)
|
|
size_border += fix->comm_border;
|
|
|
|
// per-atom limits for communication
|
|
// maxexchange = max # of datums in exchange comm, set in exchange()
|
|
// maxforward = # of datums in largest forward comm
|
|
// maxreverse = # of datums in largest reverse comm
|
|
// query pair,fix,compute,dump for their requirements
|
|
// pair style can force reverse comm even if newton off
|
|
|
|
maxforward = MAX(size_forward,size_border);
|
|
maxreverse = size_reverse;
|
|
|
|
if (force->pair) maxforward = MAX(maxforward,force->pair->comm_forward);
|
|
if (force->pair) maxreverse = MAX(maxreverse,force->pair->comm_reverse);
|
|
|
|
if (force->bond) maxforward = MAX(maxforward,force->bond->comm_forward);
|
|
if (force->bond) maxreverse = MAX(maxreverse,force->bond->comm_reverse);
|
|
|
|
for (const auto &fix : fix_list) {
|
|
maxforward = MAX(maxforward, fix->comm_forward);
|
|
maxreverse = MAX(maxreverse, fix->comm_reverse);
|
|
}
|
|
|
|
for (const auto &compute : modify->get_compute_list()) {
|
|
maxforward = MAX(maxforward,compute->comm_forward);
|
|
maxreverse = MAX(maxreverse,compute->comm_reverse);
|
|
}
|
|
|
|
for (const auto &dump: output->get_dump_list()) {
|
|
maxforward = MAX(maxforward,dump->comm_forward);
|
|
maxreverse = MAX(maxreverse,dump->comm_reverse);
|
|
}
|
|
|
|
if (force->newton == 0) maxreverse = 0;
|
|
if (force->pair) maxreverse = MAX(maxreverse,force->pair->comm_reverse_off);
|
|
if (force->bond) maxreverse = MAX(maxreverse,force->bond->comm_reverse_off);
|
|
|
|
// maxexchange_atom = size of an exchanged atom, set by AtomVec
|
|
// only needs to be set if size > BUFEXTRA
|
|
// maxexchange_fix_dynamic = 1 if any fix sets its maxexchange dynamically
|
|
|
|
maxexchange_atom = atom->avec->maxexchange;
|
|
|
|
maxexchange_fix_dynamic = 0;
|
|
for (const auto &fix : fix_list) if (fix->maxexchange_dynamic) maxexchange_fix_dynamic = 1;
|
|
|
|
if ((mode == Comm::MULTI) && (neighbor->style != Neighbor::MULTI))
|
|
error->all(FLERR,"Cannot use comm mode multi without multi-style neighbor lists");
|
|
|
|
if (multi_reduce) {
|
|
if (force->newton == 0)
|
|
error->all(FLERR,"Cannot use multi/reduce communication with Newton off");
|
|
if (neighbor->any_full())
|
|
error->all(FLERR,"Cannot use multi/reduce communication with a full neighbor list");
|
|
if (mode != Comm::MULTI)
|
|
error->all(FLERR,"Cannot use multi/reduce communication without mode multi");
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
set maxexchange based on AtomVec and fixes
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::init_exchange()
|
|
{
|
|
maxexchange_fix = 0;
|
|
for (const auto &fix : modify->get_fix_list()) maxexchange_fix += fix->maxexchange;
|
|
|
|
maxexchange = maxexchange_atom + maxexchange_fix;
|
|
bufextra = maxexchange + BUFEXTRA;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
modify communication params
|
|
invoked from input script by comm_modify command
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::modify_params(int narg, char **arg)
|
|
{
|
|
if (narg < 1) utils::missing_cmd_args(FLERR, "comm_modify", error);
|
|
|
|
int iarg = 0;
|
|
while (iarg < narg) {
|
|
if (strcmp(arg[iarg],"mode") == 0) {
|
|
if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "comm_modify mode", error);
|
|
if (strcmp(arg[iarg+1],"single") == 0) {
|
|
// need to reset cutghostuser when switching comm mode
|
|
if (mode == Comm::MULTI) cutghostuser = 0.0;
|
|
if (mode == Comm::MULTIOLD) cutghostuser = 0.0;
|
|
memory->destroy(cutusermulti);
|
|
memory->destroy(cutusermultiold);
|
|
mode = Comm::SINGLE;
|
|
} else if (strcmp(arg[iarg+1],"multi") == 0) {
|
|
if (neighbor->style != Neighbor::MULTI)
|
|
error->all(FLERR,"Cannot use comm mode 'multi' without 'multi' style neighbor lists");
|
|
// need to reset cutghostuser when switching comm mode
|
|
if (mode == Comm::SINGLE) cutghostuser = 0.0;
|
|
if (mode == Comm::MULTIOLD) cutghostuser = 0.0;
|
|
memory->destroy(cutusermultiold);
|
|
mode = Comm::MULTI;
|
|
} else if (strcmp(arg[iarg+1],"multi/old") == 0) {
|
|
if (neighbor->style == Neighbor::MULTI)
|
|
error->all(FLERR,"Cannot use comm mode 'multi/old' with 'multi' style neighbor lists");
|
|
// need to reset cutghostuser when switching comm mode
|
|
if (mode == Comm::SINGLE) cutghostuser = 0.0;
|
|
if (mode == Comm::MULTI) cutghostuser = 0.0;
|
|
memory->destroy(cutusermulti);
|
|
mode = Comm::MULTIOLD;
|
|
} else error->all(FLERR,"Unknown comm_modify mode argument: {}", arg[iarg+1]);
|
|
iarg += 2;
|
|
} else if (strcmp(arg[iarg],"group") == 0) {
|
|
if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "comm_modify group", error);
|
|
bordergroup = group->find(arg[iarg+1]);
|
|
if (bordergroup < 0)
|
|
error->all(FLERR, "Invalid comm_modify keyword: group {} not found", arg[iarg+1]);
|
|
if (bordergroup && ((atom->firstgroupname == nullptr) || strcmp(arg[iarg+1],atom->firstgroupname) != 0))
|
|
error->all(FLERR, "Comm_modify group != atom_modify first group: {}", atom->firstgroupname);
|
|
iarg += 2;
|
|
} else if (strcmp(arg[iarg],"cutoff") == 0) {
|
|
if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "comm_modify cutoff", error);
|
|
if (mode == Comm::MULTI)
|
|
error->all(FLERR, "Use cutoff/multi keyword to set cutoff in multi mode");
|
|
if (mode == Comm::MULTIOLD)
|
|
error->all(FLERR, "Use cutoff/multi/old keyword to set cutoff in multi mode");
|
|
cutghostuser = utils::numeric(FLERR,arg[iarg+1],false,lmp);
|
|
if (cutghostuser < 0.0)
|
|
error->all(FLERR,"Invalid cutoff {} in comm_modify command", arg[iarg+1]);
|
|
iarg += 2;
|
|
} else if (strcmp(arg[iarg],"cutoff/multi") == 0) {
|
|
int i,nlo,nhi;
|
|
double cut;
|
|
if (mode == Comm::SINGLE)
|
|
error->all(FLERR,"Use cutoff keyword to set cutoff in single mode");
|
|
if (mode == Comm::MULTIOLD)
|
|
error->all(FLERR,"Use cutoff/multi/old keyword to set cutoff in multi/old mode");
|
|
if (domain->box_exist == 0)
|
|
error->all(FLERR, "Cannot set cutoff/multi before simulation box is defined");
|
|
|
|
// Check if # of collections has changed, if so erase any previously defined cutoffs
|
|
// Neighbor will reset ncollections if collections are redefined
|
|
if (! cutusermulti || ncollections_cutoff != neighbor->ncollections) {
|
|
ncollections_cutoff = neighbor->ncollections;
|
|
memory->destroy(cutusermulti);
|
|
memory->create(cutusermulti,ncollections_cutoff,"comm:cutusermulti");
|
|
for (i=0; i < ncollections_cutoff; ++i)
|
|
cutusermulti[i] = -1.0;
|
|
}
|
|
utils::bounds(FLERR,arg[iarg+1],1,ncollections_cutoff,nlo,nhi,error);
|
|
cut = utils::numeric(FLERR,arg[iarg+2],false,lmp);
|
|
cutghostuser = MAX(cutghostuser,cut);
|
|
if (cut < 0.0)
|
|
error->all(FLERR,"Invalid cutoff {} in comm_modify command", arg[iarg+2]);
|
|
// collections use 1-based indexing externally and 0-based indexing internally
|
|
for (i=nlo; i<=nhi; ++i)
|
|
cutusermulti[i-1] = cut;
|
|
iarg += 3;
|
|
} else if (strcmp(arg[iarg],"cutoff/multi/old") == 0) {
|
|
int i,nlo,nhi;
|
|
double cut;
|
|
if (mode == Comm::SINGLE)
|
|
error->all(FLERR,"Use cutoff keyword to set cutoff in single mode");
|
|
if (mode == Comm::MULTI)
|
|
error->all(FLERR,"Use cutoff/multi keyword to set cutoff in multi mode");
|
|
if (domain->box_exist == 0)
|
|
error->all(FLERR, "Cannot set cutoff/multi before simulation box is defined");
|
|
const int ntypes = atom->ntypes;
|
|
if (iarg+3 > narg) utils::missing_cmd_args(FLERR, "comm_modify cutoff/multi/old", error);
|
|
if (cutusermultiold == nullptr) {
|
|
memory->create(cutusermultiold,ntypes+1,"comm:cutusermultiold");
|
|
for (i=0; i < ntypes+1; ++i)
|
|
cutusermultiold[i] = -1.0;
|
|
}
|
|
utils::bounds(FLERR,arg[iarg+1],1,ntypes,nlo,nhi,error);
|
|
cut = utils::numeric(FLERR,arg[iarg+2],false,lmp);
|
|
cutghostuser = MAX(cutghostuser,cut);
|
|
if (cut < 0.0)
|
|
error->all(FLERR,"Invalid cutoff {} in comm_modify command", arg[iarg+2]);
|
|
for (i=nlo; i<=nhi; ++i)
|
|
cutusermultiold[i] = cut;
|
|
iarg += 3;
|
|
} else if (strcmp(arg[iarg],"reduce/multi") == 0) {
|
|
if (mode == Comm::SINGLE)
|
|
error->all(FLERR,"Use reduce/multi in mode multi only");
|
|
multi_reduce = 1;
|
|
iarg += 1;
|
|
} else if (strcmp(arg[iarg],"vel") == 0) {
|
|
if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "comm_modify vel", error);
|
|
ghost_velocity = utils::logical(FLERR,arg[iarg+1],false,lmp);
|
|
iarg += 2;
|
|
} else error->all(FLERR,"Unknown comm_modify keyword: {}", arg[iarg]);
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
set dimensions for 3d grid of processors, and associated flags
|
|
invoked from input script by processors command
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::set_processors(int narg, char **arg)
|
|
{
|
|
if (narg < 3) error->all(FLERR,"Illegal processors command");
|
|
|
|
if (strcmp(arg[0],"*") == 0) user_procgrid[0] = 0;
|
|
else user_procgrid[0] = utils::inumeric(FLERR,arg[0],false,lmp);
|
|
if (strcmp(arg[1],"*") == 0) user_procgrid[1] = 0;
|
|
else user_procgrid[1] = utils::inumeric(FLERR,arg[1],false,lmp);
|
|
if (strcmp(arg[2],"*") == 0) user_procgrid[2] = 0;
|
|
else user_procgrid[2] = utils::inumeric(FLERR,arg[2],false,lmp);
|
|
|
|
if (user_procgrid[0] < 0 || user_procgrid[1] < 0 || user_procgrid[2] < 0)
|
|
error->all(FLERR,"Illegal processors command");
|
|
|
|
int p = user_procgrid[0]*user_procgrid[1]*user_procgrid[2];
|
|
if (p && p != nprocs)
|
|
error->all(FLERR,"Specified processors != physical processors");
|
|
|
|
int iarg = 3;
|
|
while (iarg < narg) {
|
|
if (strcmp(arg[iarg],"grid") == 0) {
|
|
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
|
|
|
|
if (strcmp(arg[iarg+1],"onelevel") == 0) {
|
|
gridflag = ONELEVEL;
|
|
|
|
} else if (strcmp(arg[iarg+1],"twolevel") == 0) {
|
|
if (iarg+6 > narg) error->all(FLERR,"Illegal processors command");
|
|
gridflag = TWOLEVEL;
|
|
|
|
ncores = utils::inumeric(FLERR,arg[iarg+2],false,lmp);
|
|
if (strcmp(arg[iarg+3],"*") == 0) user_coregrid[0] = 0;
|
|
else user_coregrid[0] = utils::inumeric(FLERR,arg[iarg+3],false,lmp);
|
|
if (strcmp(arg[iarg+4],"*") == 0) user_coregrid[1] = 0;
|
|
else user_coregrid[1] = utils::inumeric(FLERR,arg[iarg+4],false,lmp);
|
|
if (strcmp(arg[iarg+5],"*") == 0) user_coregrid[2] = 0;
|
|
else user_coregrid[2] = utils::inumeric(FLERR,arg[iarg+5],false,lmp);
|
|
|
|
if (ncores <= 0 || user_coregrid[0] < 0 ||
|
|
user_coregrid[1] < 0 || user_coregrid[2] < 0)
|
|
error->all(FLERR,"Illegal processors command");
|
|
iarg += 4;
|
|
|
|
} else if (strcmp(arg[iarg+1],"numa") == 0) {
|
|
gridflag = NUMA;
|
|
|
|
} else if (strcmp(arg[iarg+1],"custom") == 0) {
|
|
if (iarg+3 > narg) error->all(FLERR,"Illegal processors command");
|
|
gridflag = CUSTOM;
|
|
delete [] customfile;
|
|
customfile = utils::strdup(arg[iarg+2]);
|
|
iarg += 1;
|
|
|
|
} else error->all(FLERR,"Illegal processors command");
|
|
iarg += 2;
|
|
|
|
} else if (strcmp(arg[iarg],"map") == 0) {
|
|
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
|
|
if (strcmp(arg[iarg+1],"cart") == 0) mapflag = CART;
|
|
else if (strcmp(arg[iarg+1],"cart/reorder") == 0) mapflag = CARTREORDER;
|
|
else if (strcmp(arg[iarg+1],"xyz") == 0 ||
|
|
strcmp(arg[iarg+1],"xzy") == 0 ||
|
|
strcmp(arg[iarg+1],"yxz") == 0 ||
|
|
strcmp(arg[iarg+1],"yzx") == 0 ||
|
|
strcmp(arg[iarg+1],"zxy") == 0 ||
|
|
strcmp(arg[iarg+1],"zyx") == 0) {
|
|
mapflag = XYZ;
|
|
strncpy(xyz,arg[iarg+1],3);
|
|
} else error->all(FLERR,"Illegal processors command");
|
|
iarg += 2;
|
|
|
|
} else if (strcmp(arg[iarg],"part") == 0) {
|
|
if (iarg+4 > narg) error->all(FLERR,"Illegal processors command");
|
|
if (universe->nworlds == 1)
|
|
error->all(FLERR,
|
|
"Cannot use processors part command "
|
|
"without using partitions");
|
|
int isend = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
|
|
int irecv = utils::inumeric(FLERR,arg[iarg+2],false,lmp);
|
|
if (isend < 1 || isend > universe->nworlds ||
|
|
irecv < 1 || irecv > universe->nworlds || isend == irecv)
|
|
error->all(FLERR,"Invalid partitions in processors part command");
|
|
if (isend-1 == universe->iworld) {
|
|
if (send_to_partition >= 0)
|
|
error->all(FLERR,
|
|
"Sending partition in processors part command "
|
|
"is already a sender");
|
|
send_to_partition = irecv-1;
|
|
}
|
|
if (irecv-1 == universe->iworld) {
|
|
if (recv_from_partition >= 0)
|
|
error->all(FLERR,
|
|
"Receiving partition in processors part command "
|
|
"is already a receiver");
|
|
recv_from_partition = isend-1;
|
|
}
|
|
|
|
// only receiver has otherflag dependency
|
|
|
|
if (strcmp(arg[iarg+3],"multiple") == 0) {
|
|
if (universe->iworld == irecv-1) {
|
|
otherflag = 1;
|
|
other_style = Comm::MULTIPLE;
|
|
}
|
|
} else error->all(FLERR,"Illegal processors command");
|
|
iarg += 4;
|
|
|
|
} else if (strcmp(arg[iarg],"file") == 0) {
|
|
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
|
|
delete [] outfile;
|
|
outfile = utils::strdup(arg[iarg+1]);
|
|
iarg += 2;
|
|
|
|
} else error->all(FLERR,"Illegal processors command");
|
|
}
|
|
|
|
// error checks
|
|
|
|
if (gridflag == NUMA && mapflag != CART)
|
|
error->all(FLERR,"Processors grid numa and map style are incompatible");
|
|
if (otherflag && (gridflag == NUMA || gridflag == CUSTOM))
|
|
error->all(FLERR,
|
|
"Processors part option and grid style are incompatible");
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
create a 3d grid of procs based on Nprocs and box size & shape
|
|
map processors to grid, setup xyz split for a uniform grid
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::set_proc_grid(int outflag)
|
|
{
|
|
// recv 3d proc grid of another partition if my 3d grid depends on it
|
|
|
|
if (recv_from_partition >= 0) {
|
|
if (me == 0) {
|
|
MPI_Recv(other_procgrid,3,MPI_INT,
|
|
universe->root_proc[recv_from_partition],0,
|
|
universe->uworld,MPI_STATUS_IGNORE);
|
|
MPI_Recv(other_coregrid,3,MPI_INT,
|
|
universe->root_proc[recv_from_partition],0,
|
|
universe->uworld,MPI_STATUS_IGNORE);
|
|
}
|
|
MPI_Bcast(other_procgrid,3,MPI_INT,0,world);
|
|
MPI_Bcast(other_coregrid,3,MPI_INT,0,world);
|
|
}
|
|
|
|
// create ProcMap class to create 3d grid and map procs to it
|
|
|
|
auto pmap = new ProcMap(lmp);
|
|
|
|
// create 3d grid of processors
|
|
// produces procgrid and coregrid (if relevant)
|
|
|
|
if (gridflag == ONELEVEL) {
|
|
pmap->onelevel_grid(nprocs,user_procgrid,procgrid,
|
|
otherflag,other_style,other_procgrid,other_coregrid);
|
|
|
|
} else if (gridflag == TWOLEVEL) {
|
|
pmap->twolevel_grid(nprocs,user_procgrid,procgrid,
|
|
ncores,user_coregrid,coregrid,
|
|
otherflag,other_style,other_procgrid,other_coregrid);
|
|
|
|
} else if (gridflag == NUMA) {
|
|
pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
|
|
|
|
} else if (gridflag == CUSTOM) {
|
|
pmap->custom_grid(customfile,nprocs,user_procgrid,procgrid);
|
|
}
|
|
|
|
// error check on procgrid
|
|
// should not be necessary due to ProcMap
|
|
|
|
if (procgrid[0]*procgrid[1]*procgrid[2] != nprocs)
|
|
error->all(FLERR,"Bad grid of processors");
|
|
if (domain->dimension == 2 && procgrid[2] != 1)
|
|
error->all(FLERR,"Processor count in z must be 1 for 2d simulation");
|
|
|
|
// grid2proc[i][j][k] = proc that owns i,j,k location in 3d grid
|
|
|
|
if (grid2proc) memory->destroy(grid2proc);
|
|
memory->create(grid2proc,procgrid[0],procgrid[1],procgrid[2],
|
|
"comm:grid2proc");
|
|
|
|
// map processor IDs to 3d processor grid
|
|
// produces myloc, procneigh, grid2proc
|
|
|
|
if (gridflag == ONELEVEL) {
|
|
if (mapflag == CART)
|
|
pmap->cart_map(0,procgrid,myloc,procneigh,grid2proc);
|
|
else if (mapflag == CARTREORDER)
|
|
pmap->cart_map(1,procgrid,myloc,procneigh,grid2proc);
|
|
else if (mapflag == XYZ)
|
|
pmap->xyz_map(xyz,procgrid,myloc,procneigh,grid2proc);
|
|
|
|
} else if (gridflag == TWOLEVEL) {
|
|
if (mapflag == CART)
|
|
pmap->cart_map(0,procgrid,ncores,coregrid,myloc,procneigh,grid2proc);
|
|
else if (mapflag == CARTREORDER)
|
|
pmap->cart_map(1,procgrid,ncores,coregrid,myloc,procneigh,grid2proc);
|
|
else if (mapflag == XYZ)
|
|
pmap->xyz_map(xyz,procgrid,ncores,coregrid,myloc,procneigh,grid2proc);
|
|
|
|
} else if (gridflag == NUMA) {
|
|
pmap->numa_map(0,coregrid,myloc,procneigh,grid2proc);
|
|
|
|
} else if (gridflag == CUSTOM) {
|
|
pmap->custom_map(procgrid,myloc,procneigh,grid2proc);
|
|
}
|
|
|
|
// print 3d grid info to screen and logfile
|
|
|
|
if (outflag && me == 0) {
|
|
auto mesg = fmt::format(" {} by {} by {} MPI processor grid\n",
|
|
procgrid[0],procgrid[1],procgrid[2]);
|
|
if (gridflag == NUMA || gridflag == TWOLEVEL)
|
|
mesg += fmt::format(" {} by {} by {} core grid within node\n",
|
|
coregrid[0],coregrid[1],coregrid[2]);
|
|
utils::logmesg(lmp,mesg);
|
|
}
|
|
|
|
// print 3d grid details to outfile
|
|
|
|
if (outfile) pmap->output(outfile,procgrid,grid2proc);
|
|
|
|
// free ProcMap class
|
|
|
|
delete pmap;
|
|
|
|
// set xsplit,ysplit,zsplit for uniform spacings
|
|
|
|
memory->destroy(xsplit);
|
|
memory->destroy(ysplit);
|
|
memory->destroy(zsplit);
|
|
|
|
memory->create(xsplit,procgrid[0]+1,"comm:xsplit");
|
|
memory->create(ysplit,procgrid[1]+1,"comm:ysplit");
|
|
memory->create(zsplit,procgrid[2]+1,"comm:zsplit");
|
|
|
|
for (int i = 0; i < procgrid[0]; i++) xsplit[i] = i * 1.0/procgrid[0];
|
|
for (int i = 0; i < procgrid[1]; i++) ysplit[i] = i * 1.0/procgrid[1];
|
|
for (int i = 0; i < procgrid[2]; i++) zsplit[i] = i * 1.0/procgrid[2];
|
|
|
|
xsplit[procgrid[0]] = ysplit[procgrid[1]] = zsplit[procgrid[2]] = 1.0;
|
|
|
|
// set lamda box params after procs are assigned
|
|
// only set once unless load-balancing occurs
|
|
|
|
if (domain->triclinic) domain->set_lamda_box();
|
|
|
|
// send my 3d proc grid to another partition if requested
|
|
|
|
if (send_to_partition >= 0) {
|
|
if (me == 0) {
|
|
MPI_Send(procgrid,3,MPI_INT,
|
|
universe->root_proc[send_to_partition],0,
|
|
universe->uworld);
|
|
MPI_Send(coregrid,3,MPI_INT,
|
|
universe->root_proc[send_to_partition],0,
|
|
universe->uworld);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
determine suitable communication cutoff.
|
|
this uses three inputs: 1) maximum neighborlist cutoff, 2) an estimate
|
|
based on bond lengths and bonded interaction styles present, and 3) a
|
|
user supplied communication cutoff.
|
|
the neighbor list cutoff (1) is *always* used, since it is a requirement
|
|
for neighborlists working correctly. the bond length based cutoff is
|
|
*only* used, if no pair style is defined and no user cutoff is provided.
|
|
otherwise, a warning is printed. if the bond length based estimate is
|
|
larger than what is used.
|
|
print a warning, if a user specified communication cutoff is overridden.
|
|
------------------------------------------------------------------------- */
|
|
|
|
double Comm::get_comm_cutoff()
|
|
{
|
|
double maxcommcutoff, maxbondcutoff = 0.0;
|
|
|
|
if (force->bond) {
|
|
int n = atom->nbondtypes;
|
|
for (int i = 1; i <= n; ++i)
|
|
maxbondcutoff = MAX(maxbondcutoff,force->bond->equilibrium_distance(i));
|
|
|
|
// apply bond length based heuristics.
|
|
|
|
if (force->newton_bond) {
|
|
if (force->dihedral || force->improper) {
|
|
maxbondcutoff *= 2.25;
|
|
} else {
|
|
maxbondcutoff *=1.5;
|
|
}
|
|
} else {
|
|
if (force->dihedral || force->improper) {
|
|
maxbondcutoff *= 3.125;
|
|
} else if (force->angle) {
|
|
maxbondcutoff *= 2.25;
|
|
} else {
|
|
maxbondcutoff *=1.5;
|
|
}
|
|
}
|
|
maxbondcutoff += neighbor->skin;
|
|
}
|
|
|
|
// always take the larger of max neighbor list and user specified cutoff
|
|
|
|
maxcommcutoff = MAX(cutghostuser,neighbor->cutneighmax);
|
|
|
|
// use cutoff estimate from bond length only if no user specified
|
|
// cutoff was given and no pair style present. Otherwise print a
|
|
// warning, if the estimated bond based cutoff is larger than what
|
|
// is currently used.
|
|
|
|
if (!force->pair && (cutghostuser == 0.0)) {
|
|
maxcommcutoff = MAX(maxcommcutoff,maxbondcutoff);
|
|
} else {
|
|
if ((me == 0) && (maxbondcutoff > maxcommcutoff))
|
|
error->warning(FLERR,"Communication cutoff {} is shorter than a bond "
|
|
"length based estimate of {}. This may lead to errors.",
|
|
maxcommcutoff,maxbondcutoff);
|
|
}
|
|
|
|
// print warning if neighborlist cutoff overrides user cutoff
|
|
|
|
if ((me == 0) && (update->setupflag == 1)) {
|
|
if ((cutghostuser > 0.0) && (maxcommcutoff > cutghostuser))
|
|
error->warning(FLERR,"Communication cutoff adjusted to {}",maxcommcutoff);
|
|
}
|
|
|
|
// check maximum interval size for neighbor multi
|
|
|
|
if (neighbor->interval_collection_flag) {
|
|
for (int i = 0; i < neighbor->ncollections; i++){
|
|
maxcommcutoff = MAX(maxcommcutoff, neighbor->collection2cut[i]);
|
|
}
|
|
}
|
|
|
|
return maxcommcutoff;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
determine which proc owns atom with coord x[3] based on current decomp
|
|
x will be in box (orthogonal) or lamda coords (triclinic)
|
|
if layout = UNIFORM, calculate owning proc directly
|
|
if layout = NONUNIFORM, iteratively find owning proc via binary search
|
|
if layout = TILED, CommTiled has its own method
|
|
return owning proc ID via grid2proc
|
|
return igx,igy,igz = logical grid loc of owing proc within 3d grid of procs
|
|
------------------------------------------------------------------------- */
|
|
|
|
int Comm::coord2proc(double *x, int &igx, int &igy, int &igz)
|
|
{
|
|
double *prd = domain->prd;
|
|
double *boxlo = domain->boxlo;
|
|
|
|
// initialize triclinic b/c coord2proc can be called before Comm::init()
|
|
// via Irregular::migrate_atoms()
|
|
|
|
triclinic = domain->triclinic;
|
|
|
|
if (layout == Comm::LAYOUT_UNIFORM) {
|
|
if (triclinic == 0) {
|
|
igx = static_cast<int> (procgrid[0] * (x[0]-boxlo[0]) / prd[0]);
|
|
igy = static_cast<int> (procgrid[1] * (x[1]-boxlo[1]) / prd[1]);
|
|
igz = static_cast<int> (procgrid[2] * (x[2]-boxlo[2]) / prd[2]);
|
|
} else {
|
|
igx = static_cast<int> (procgrid[0] * x[0]);
|
|
igy = static_cast<int> (procgrid[1] * x[1]);
|
|
igz = static_cast<int> (procgrid[2] * x[2]);
|
|
}
|
|
|
|
} else if (layout == Comm::LAYOUT_NONUNIFORM) {
|
|
if (triclinic == 0) {
|
|
igx = utils::binary_search((x[0]-boxlo[0])/prd[0],procgrid[0],xsplit);
|
|
igy = utils::binary_search((x[1]-boxlo[1])/prd[1],procgrid[1],ysplit);
|
|
igz = utils::binary_search((x[2]-boxlo[2])/prd[2],procgrid[2],zsplit);
|
|
} else {
|
|
igx = utils::binary_search(x[0],procgrid[0],xsplit);
|
|
igy = utils::binary_search(x[1],procgrid[1],ysplit);
|
|
igz = utils::binary_search(x[2],procgrid[2],zsplit);
|
|
}
|
|
}
|
|
|
|
if (igx < 0) igx = 0;
|
|
if (igx >= procgrid[0]) igx = procgrid[0] - 1;
|
|
if (igy < 0) igy = 0;
|
|
if (igy >= procgrid[1]) igy = procgrid[1] - 1;
|
|
if (igz < 0) igz = 0;
|
|
if (igz >= procgrid[2]) igz = procgrid[2] - 1;
|
|
|
|
return grid2proc[igx][igy][igz];
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
partition a global regular grid into one brick-shaped sub-grid per proc
|
|
if grid point is inside my sub-domain I own it,
|
|
this includes sub-domain lo boundary but excludes hi boundary
|
|
nx,ny,nz = extent of global grid
|
|
indices into the global grid range from 0 to N-1 in each dim
|
|
zfactor = 0.0 if the grid exactly covers the simulation box
|
|
zfactor > 1.0 if the grid extends beyond the +z boundary by this factor
|
|
used by 2d slab-mode PPPM
|
|
this effectively maps proc sub-grids to a smaller subset of the grid
|
|
nxyz lo/hi = inclusive lo/hi bounds of global grid sub-brick I own
|
|
if proc owns no grid cells in a dim, then nlo > nhi
|
|
special case: 2 procs share boundary which a grid point is exactly on
|
|
2 equality if tests insure a consistent decision as to which proc owns it
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::partition_grid(int nx, int ny, int nz, double zfactor,
|
|
int &nxlo, int &nxhi, int &nylo, int &nyhi,
|
|
int &nzlo, int &nzhi)
|
|
{
|
|
double xfraclo,xfrachi,yfraclo,yfrachi,zfraclo,zfrachi;
|
|
|
|
if (layout != LAYOUT_TILED) {
|
|
xfraclo = xsplit[myloc[0]];
|
|
xfrachi = xsplit[myloc[0]+1];
|
|
yfraclo = ysplit[myloc[1]];
|
|
yfrachi = ysplit[myloc[1]+1];
|
|
zfraclo = zsplit[myloc[2]];
|
|
zfrachi = zsplit[myloc[2]+1];
|
|
} else {
|
|
xfraclo = mysplit[0][0];
|
|
xfrachi = mysplit[0][1];
|
|
yfraclo = mysplit[1][0];
|
|
yfrachi = mysplit[1][1];
|
|
zfraclo = mysplit[2][0];
|
|
zfrachi = mysplit[2][1];
|
|
}
|
|
|
|
nxlo = static_cast<int> (xfraclo * nx);
|
|
if (1.0*nxlo != xfraclo*nx) nxlo++;
|
|
nxhi = static_cast<int> (xfrachi * nx);
|
|
if (1.0*nxhi == xfrachi*nx) nxhi--;
|
|
|
|
nylo = static_cast<int> (yfraclo * ny);
|
|
if (1.0*nylo != yfraclo*ny) nylo++;
|
|
nyhi = static_cast<int> (yfrachi * ny);
|
|
if (1.0*nyhi == yfrachi*ny) nyhi--;
|
|
|
|
if (zfactor == 0.0) {
|
|
nzlo = static_cast<int> (zfraclo * nz);
|
|
if (1.0*nzlo != zfraclo*nz) nzlo++;
|
|
nzhi = static_cast<int> (zfrachi * nz);
|
|
if (1.0*nzhi == zfrachi*nz) nzhi--;
|
|
} else {
|
|
nzlo = static_cast<int> (zfraclo * nz/zfactor);
|
|
if (1.0*nzlo != zfraclo*nz) nzlo++;
|
|
nzhi = static_cast<int> (zfrachi * nz/zfactor);
|
|
if (1.0*nzhi == zfrachi*nz) nzhi--;
|
|
}
|
|
|
|
// OLD code
|
|
// could sometimes map grid points slightly outside a proc to the proc
|
|
|
|
/*
|
|
if (layout != LAYOUT_TILED) {
|
|
nxlo = static_cast<int> (xsplit[myloc[0]] * nx);
|
|
nxhi = static_cast<int> (xsplit[myloc[0]+1] * nx) - 1;
|
|
|
|
nylo = static_cast<int> (ysplit[myloc[1]] * ny);
|
|
nyhi = static_cast<int> (ysplit[myloc[1]+1] * ny) - 1;
|
|
|
|
if (zfactor == 0.0) {
|
|
nzlo = static_cast<int> (zsplit[myloc[2]] * nz);
|
|
nzhi = static_cast<int> (zsplit[myloc[2]+1] * nz) - 1;
|
|
} else {
|
|
nzlo = static_cast<int> (zsplit[myloc[2]] * nz/zfactor);
|
|
nzhi = static_cast<int> (zsplit[myloc[2]+1] * nz/zfactor) - 1;
|
|
}
|
|
|
|
} else {
|
|
nxlo = static_cast<int> (mysplit[0][0] * nx);
|
|
nxhi = static_cast<int> (mysplit[0][1] * nx) - 1;
|
|
|
|
nylo = static_cast<int> (mysplit[1][0] * ny);
|
|
nyhi = static_cast<int> (mysplit[1][1] * ny) - 1;
|
|
|
|
if (zfactor == 0.0) {
|
|
nzlo = static_cast<int> (mysplit[2][0] * nz);
|
|
nzhi = static_cast<int> (mysplit[2][1] * nz) - 1;
|
|
} else {
|
|
nzlo = static_cast<int> (mysplit[2][0] * nz/zfactor);
|
|
nzhi = static_cast<int> (mysplit[2][1] * nz/zfactor) - 1;
|
|
}
|
|
}
|
|
*/
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
communicate inbuf around full ring of processors with messtag
|
|
nbytes = size of inbuf = n datums * nper bytes
|
|
callback() is invoked to allow caller to process/update each proc's inbuf
|
|
if self=1 (default), then callback() is invoked on final iteration
|
|
using original inbuf, which may have been updated
|
|
for non-nullptr outbuf, final updated inbuf is copied to it
|
|
ok to specify outbuf = inbuf
|
|
the ptr argument is a pointer to the instance of calling class
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::ring(int n, int nper, void *inbuf, int messtag,
|
|
void (*callback)(int, char *, void *),
|
|
void *outbuf, void *ptr, int self)
|
|
{
|
|
MPI_Request request;
|
|
MPI_Status status;
|
|
|
|
int nbytes = n*nper;
|
|
int maxbytes;
|
|
MPI_Allreduce(&nbytes,&maxbytes,1,MPI_INT,MPI_MAX,world);
|
|
|
|
// no need to communicate without data
|
|
|
|
if (maxbytes == 0) return;
|
|
|
|
// sanity check
|
|
|
|
if ((nbytes > 0) && inbuf == nullptr)
|
|
error->one(FLERR,"Cannot put data on ring from NULL pointer");
|
|
|
|
char *buf,*bufcopy;
|
|
memory->create(buf,maxbytes,"comm:buf");
|
|
memory->create(bufcopy,maxbytes,"comm:bufcopy");
|
|
if (nbytes && inbuf) memcpy(buf,inbuf,nbytes);
|
|
|
|
int next = me + 1;
|
|
int prev = me - 1;
|
|
if (next == nprocs) next = 0;
|
|
if (prev < 0) prev = nprocs - 1;
|
|
|
|
for (int loop = 0; loop < nprocs; loop++) {
|
|
if (me != next) {
|
|
MPI_Irecv(bufcopy,maxbytes,MPI_CHAR,prev,messtag,world,&request);
|
|
MPI_Send(buf,nbytes,MPI_CHAR,next,messtag,world);
|
|
MPI_Wait(&request,&status);
|
|
MPI_Get_count(&status,MPI_CHAR,&nbytes);
|
|
if (nbytes) memcpy(buf,bufcopy,nbytes);
|
|
}
|
|
if (self || loop < nprocs-1) callback(nbytes/nper,buf,ptr);
|
|
}
|
|
|
|
if (nbytes && outbuf) memcpy(outbuf,buf,nbytes);
|
|
|
|
memory->destroy(buf);
|
|
memory->destroy(bufcopy);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
rendezvous communication operation
|
|
three stages:
|
|
first comm sends inbuf from caller decomp to rvous decomp
|
|
callback operates on data in rendezvous decomp
|
|
second comm sends outbuf from rvous decomp back to caller decomp
|
|
inputs:
|
|
which = perform (0) irregular or (1) MPI_All2allv communication
|
|
n = # of datums in inbuf
|
|
inbuf = vector of input datums
|
|
insize = byte size of each input datum
|
|
inorder = 0 for inbuf in random proc order, 1 for datums ordered by proc
|
|
procs: inorder 0 = proc to send each datum to, 1 = # of datums/proc,
|
|
callback = caller function to invoke in rendezvous decomposition
|
|
takes input datums, returns output datums
|
|
outorder = same as inorder, but for datums returned by callback()
|
|
ptr = pointer to caller class, passed to callback()
|
|
outputs:
|
|
nout = # of output datums (function return)
|
|
outbuf = vector of output datums
|
|
outsize = byte size of each output datum
|
|
callback inputs:
|
|
nrvous = # of rvous decomp datums in inbuf_rvous
|
|
inbuf_rvous = vector of rvous decomp input datums
|
|
ptr = pointer to caller class
|
|
callback outputs:
|
|
nrvous_out = # of rvous decomp output datums (function return)
|
|
flag = 0 for no second comm, 1 for outbuf_rvous = inbuf_rvous,
|
|
2 for second comm with new outbuf_rvous
|
|
procs_rvous = outorder 0 = proc to send each datum to, 1 = # of datums/proc
|
|
allocated
|
|
outbuf_rvous = vector of rvous decomp output datums
|
|
NOTE: could use MPI_INT or MPI_DOUBLE insead of MPI_CHAR
|
|
to avoid checked-for overflow in MPI_Alltoallv?
|
|
------------------------------------------------------------------------- */
|
|
|
|
int Comm::
|
|
rendezvous(int which, int n, char *inbuf, int insize,
|
|
int inorder, int *procs,
|
|
int (*callback)(int, char *, int &, int *&, char *&, void *),
|
|
int outorder, char *&outbuf, int outsize, void *ptr, int statflag)
|
|
{
|
|
if (which == 0)
|
|
return rendezvous_irregular(n,inbuf,insize,inorder,procs,callback,
|
|
outorder,outbuf,outsize,ptr,statflag);
|
|
else
|
|
return rendezvous_all2all(n,inbuf,insize,inorder,procs,callback,
|
|
outorder,outbuf,outsize,ptr,statflag);
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
int Comm::
|
|
rendezvous_irregular(int n, char *inbuf, int insize, int inorder, int *procs,
|
|
int (*callback)(int, char *, int &, int *&, char *&, void *),
|
|
int outorder, char *&outbuf,
|
|
int outsize, void *ptr, int statflag)
|
|
{
|
|
// irregular comm of inbuf from caller decomp to rendezvous decomp
|
|
|
|
auto irregular = new Irregular(lmp);
|
|
|
|
int nrvous;
|
|
if (inorder) nrvous = irregular->create_data_grouped(n,procs);
|
|
else nrvous = irregular->create_data(n,procs);
|
|
|
|
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
|
|
|
|
auto inbuf_rvous = (char *) memory->smalloc((bigint) nrvous*insize+1, "rendezvous:inbuf");
|
|
irregular->exchange_data(inbuf,insize,inbuf_rvous);
|
|
|
|
bigint irregular1_bytes = irregular->memory_usage();
|
|
irregular->destroy_data();
|
|
delete irregular;
|
|
|
|
// peform rendezvous computation via callback()
|
|
// callback() allocates/populates proclist_rvous and outbuf_rvous
|
|
|
|
int flag;
|
|
int *procs_rvous;
|
|
char *outbuf_rvous;
|
|
int nrvous_out = callback(nrvous,inbuf_rvous,flag, procs_rvous,outbuf_rvous,ptr);
|
|
|
|
if (flag != 1) memory->sfree(inbuf_rvous); // outbuf_rvous = inbuf_vous
|
|
if (flag == 0) {
|
|
if (statflag) rendezvous_stats(n,0,nrvous,nrvous_out,insize,outsize,
|
|
(bigint) nrvous_out*sizeof(int) + irregular1_bytes);
|
|
return 0; // all nout_rvous are 0, no 2nd comm stage
|
|
}
|
|
|
|
// irregular comm of outbuf from rendezvous decomp back to caller decomp
|
|
// caller will free outbuf
|
|
|
|
irregular = new Irregular(lmp);
|
|
|
|
int nout;
|
|
if (outorder) nout = irregular->create_data_grouped(nrvous_out,procs_rvous);
|
|
else nout = irregular->create_data(nrvous_out,procs_rvous);
|
|
|
|
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
|
|
|
|
outbuf = (char *) memory->smalloc((bigint) nout*outsize+1, "rendezvous:outbuf");
|
|
irregular->exchange_data(outbuf_rvous,outsize,outbuf);
|
|
|
|
bigint irregular2_bytes = irregular->memory_usage();
|
|
irregular->destroy_data();
|
|
delete irregular;
|
|
|
|
memory->destroy(procs_rvous);
|
|
memory->sfree(outbuf_rvous);
|
|
|
|
// return number of output datums
|
|
// last arg to stats() = memory for procs_rvous + irregular comm
|
|
|
|
if (statflag) rendezvous_stats(n,nout,nrvous,nrvous_out,insize,outsize,
|
|
(bigint) nrvous_out*sizeof(int) +
|
|
MAX(irregular1_bytes,irregular2_bytes));
|
|
return nout;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
int Comm::
|
|
rendezvous_all2all(int n, char *inbuf, int insize, int inorder, int *procs,
|
|
int (*callback)(int, char *, int &, int *&, char *&, void *),
|
|
int outorder, char *&outbuf, int outsize, void *ptr,
|
|
int statflag)
|
|
{
|
|
int iproc;
|
|
bigint all2all1_bytes,all2all2_bytes;
|
|
int *sendcount,*sdispls,*recvcount,*rdispls;
|
|
int *procs_a2a;
|
|
bigint *offsets;
|
|
char *inbuf_a2a,*outbuf_a2a;
|
|
|
|
// create procs and inbuf for All2all if necessary
|
|
|
|
if (!inorder) {
|
|
memory->create(procs_a2a,nprocs,"rendezvous:procs");
|
|
|
|
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
|
|
|
|
inbuf_a2a = (char *) memory->smalloc((bigint) n*insize+1,
|
|
"rendezvous:inbuf");
|
|
memset(inbuf_a2a,0,(bigint)n*insize*sizeof(char));
|
|
memory->create(offsets,nprocs,"rendezvous:offsets");
|
|
|
|
for (int i = 0; i < nprocs; i++) procs_a2a[i] = 0;
|
|
for (int i = 0; i < n; i++) procs_a2a[procs[i]]++;
|
|
|
|
offsets[0] = 0;
|
|
for (int i = 1; i < nprocs; i++)
|
|
offsets[i] = offsets[i-1] + (bigint)insize*procs_a2a[i-1];
|
|
|
|
bigint offset = 0;
|
|
for (int i = 0; i < n; i++) {
|
|
iproc = procs[i];
|
|
memcpy(&inbuf_a2a[offsets[iproc]],&inbuf[offset],insize);
|
|
offsets[iproc] += insize;
|
|
offset += insize;
|
|
}
|
|
|
|
all2all1_bytes = nprocs*sizeof(int) + nprocs*sizeof(bigint)
|
|
+ (bigint)n*insize;
|
|
|
|
} else {
|
|
procs_a2a = procs;
|
|
inbuf_a2a = inbuf;
|
|
all2all1_bytes = 0;
|
|
}
|
|
|
|
// create args for MPI_Alltoallv() on input data
|
|
|
|
memory->create(sendcount,nprocs,"rendezvous:sendcount");
|
|
memcpy(sendcount,procs_a2a,nprocs*sizeof(int));
|
|
|
|
memory->create(recvcount,nprocs,"rendezvous:recvcount");
|
|
MPI_Alltoall(sendcount,1,MPI_INT,recvcount,1,MPI_INT,world);
|
|
|
|
memory->create(sdispls,nprocs,"rendezvous:sdispls");
|
|
memory->create(rdispls,nprocs,"rendezvous:rdispls");
|
|
sdispls[0] = rdispls[0] = 0;
|
|
for (int i = 1; i < nprocs; i++) {
|
|
sdispls[i] = sdispls[i-1] + sendcount[i-1];
|
|
rdispls[i] = rdispls[i-1] + recvcount[i-1];
|
|
}
|
|
int nrvous = rdispls[nprocs-1] + recvcount[nprocs-1];
|
|
|
|
// test for overflow of input data due to imbalance or insize
|
|
// means that individual sdispls or rdispls values overflow
|
|
|
|
int overflow = 0;
|
|
if ((bigint) n*insize > MAXSMALLINT) overflow = 1;
|
|
if ((bigint) nrvous*insize > MAXSMALLINT) overflow = 1;
|
|
int overflowall;
|
|
MPI_Allreduce(&overflow,&overflowall,1,MPI_INT,MPI_MAX,world);
|
|
if (overflowall) error->all(FLERR,"Overflow input size in rendezvous_a2a");
|
|
|
|
for (int i = 0; i < nprocs; i++) {
|
|
sendcount[i] *= insize;
|
|
sdispls[i] *= insize;
|
|
recvcount[i] *= insize;
|
|
rdispls[i] *= insize;
|
|
}
|
|
|
|
// all2all comm of inbuf from caller decomp to rendezvous decomp
|
|
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
|
|
|
|
auto inbuf_rvous = (char *) memory->smalloc((bigint) nrvous*insize+1, "rendezvous:inbuf");
|
|
memset(inbuf_rvous,0,(bigint) nrvous*insize*sizeof(char));
|
|
|
|
MPI_Alltoallv(inbuf_a2a,sendcount,sdispls,MPI_CHAR,
|
|
inbuf_rvous,recvcount,rdispls,MPI_CHAR,world);
|
|
|
|
if (!inorder) {
|
|
memory->destroy(procs_a2a);
|
|
memory->sfree(inbuf_a2a);
|
|
memory->destroy(offsets);
|
|
}
|
|
|
|
// peform rendezvous computation via callback()
|
|
// callback() allocates/populates proclist_rvous and outbuf_rvous
|
|
|
|
int flag;
|
|
int *procs_rvous;
|
|
char *outbuf_rvous;
|
|
|
|
int nrvous_out = callback(nrvous,inbuf_rvous,flag, procs_rvous,outbuf_rvous,ptr);
|
|
|
|
if (flag != 1) memory->sfree(inbuf_rvous); // outbuf_rvous = inbuf_vous
|
|
if (flag == 0) {
|
|
memory->destroy(sendcount);
|
|
memory->destroy(recvcount);
|
|
memory->destroy(sdispls);
|
|
memory->destroy(rdispls);
|
|
if (statflag) rendezvous_stats(n,0,nrvous,nrvous_out,insize,outsize,
|
|
(bigint) nrvous_out*sizeof(int) +
|
|
4*nprocs*sizeof(int) + all2all1_bytes);
|
|
return 0; // all nout_rvous are 0, no 2nd irregular
|
|
}
|
|
|
|
// create procs and outbuf for All2all if necessary
|
|
|
|
if (!outorder) {
|
|
memory->create(procs_a2a,nprocs,"rendezvous_a2a:procs");
|
|
|
|
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
|
|
|
|
outbuf_a2a = (char *) memory->smalloc((bigint) nrvous_out*outsize+1, "rendezvous:outbuf");
|
|
memory->create(offsets,nprocs,"rendezvous:offsets");
|
|
|
|
for (int i = 0; i < nprocs; i++) procs_a2a[i] = 0;
|
|
for (int i = 0; i < nrvous_out; i++) procs_a2a[procs_rvous[i]]++;
|
|
|
|
offsets[0] = 0;
|
|
for (int i = 1; i < nprocs; i++)
|
|
offsets[i] = offsets[i-1] + (bigint)outsize*procs_a2a[i-1];
|
|
|
|
bigint offset = 0;
|
|
for (int i = 0; i < nrvous_out; i++) {
|
|
iproc = procs_rvous[i];
|
|
memcpy(&outbuf_a2a[offsets[iproc]],&outbuf_rvous[offset],outsize);
|
|
offsets[iproc] += outsize;
|
|
offset += outsize;
|
|
}
|
|
|
|
all2all2_bytes = nprocs*sizeof(int) + nprocs*sizeof(bigint) + (bigint)nrvous_out*outsize;
|
|
|
|
} else {
|
|
procs_a2a = procs_rvous;
|
|
outbuf_a2a = outbuf_rvous;
|
|
all2all2_bytes = 0;
|
|
}
|
|
|
|
// comm outbuf from rendezvous decomposition back to caller
|
|
|
|
memcpy(sendcount,procs_a2a,nprocs*sizeof(int));
|
|
|
|
MPI_Alltoall(sendcount,1,MPI_INT,recvcount,1,MPI_INT,world);
|
|
|
|
sdispls[0] = rdispls[0] = 0;
|
|
for (int i = 1; i < nprocs; i++) {
|
|
sdispls[i] = sdispls[i-1] + sendcount[i-1];
|
|
rdispls[i] = rdispls[i-1] + recvcount[i-1];
|
|
}
|
|
int nout = rdispls[nprocs-1] + recvcount[nprocs-1];
|
|
|
|
// test for overflow of outbuf due to imbalance or outsize
|
|
// means that individual sdispls or rdispls values overflow
|
|
|
|
overflow = 0;
|
|
if ((bigint) nrvous*outsize > MAXSMALLINT) overflow = 1;
|
|
if ((bigint) nout*outsize > MAXSMALLINT) overflow = 1;
|
|
MPI_Allreduce(&overflow,&overflowall,1,MPI_INT,MPI_MAX,world);
|
|
if (overflowall) error->all(FLERR,"Overflow output in rendezvous_a2a");
|
|
|
|
for (int i = 0; i < nprocs; i++) {
|
|
sendcount[i] *= outsize;
|
|
sdispls[i] *= outsize;
|
|
recvcount[i] *= outsize;
|
|
rdispls[i] *= outsize;
|
|
}
|
|
|
|
// all2all comm of outbuf from rendezvous decomp back to caller decomp
|
|
// caller will free outbuf
|
|
// add 1 item to the allocated buffer size, so the returned pointer is not a null pointer
|
|
|
|
outbuf = (char *) memory->smalloc((bigint) nout*outsize+1,"rendezvous:outbuf");
|
|
|
|
MPI_Alltoallv(outbuf_a2a,sendcount,sdispls,MPI_CHAR,
|
|
outbuf,recvcount,rdispls,MPI_CHAR,world);
|
|
|
|
memory->destroy(procs_rvous);
|
|
memory->sfree(outbuf_rvous);
|
|
|
|
if (!outorder) {
|
|
memory->destroy(procs_a2a);
|
|
memory->sfree(outbuf_a2a);
|
|
memory->destroy(offsets);
|
|
}
|
|
|
|
// clean up
|
|
|
|
memory->destroy(sendcount);
|
|
memory->destroy(recvcount);
|
|
memory->destroy(sdispls);
|
|
memory->destroy(rdispls);
|
|
|
|
// return number of output datums
|
|
// last arg to stats() = mem for procs_rvous + per-proc vecs + reordering ops
|
|
|
|
if (statflag) rendezvous_stats(n,nout,nrvous,nrvous_out,insize,outsize,
|
|
(bigint) nrvous_out*sizeof(int) +
|
|
4*nprocs*sizeof(int) +
|
|
MAX(all2all1_bytes,all2all2_bytes));
|
|
return nout;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
print balance and memory info for rendezvous operation
|
|
useful for debugging
|
|
------------------------------------------------------------------------- */
|
|
|
|
void Comm::rendezvous_stats(int n, int nout, int nrvous, int nrvous_out,
|
|
int insize, int outsize, bigint commsize)
|
|
{
|
|
bigint size_in_all,size_in_max,size_in_min;
|
|
bigint size_out_all,size_out_max,size_out_min;
|
|
bigint size_inrvous_all,size_inrvous_max,size_inrvous_min;
|
|
bigint size_outrvous_all,size_outrvous_max,size_outrvous_min;
|
|
bigint size_comm_all,size_comm_max,size_comm_min;
|
|
|
|
bigint size = (bigint) n*insize;
|
|
MPI_Allreduce(&size,&size_in_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
|
|
MPI_Allreduce(&size,&size_in_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
|
|
MPI_Allreduce(&size,&size_in_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
|
|
|
|
size = (bigint) nout*outsize;
|
|
MPI_Allreduce(&size,&size_out_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
|
|
MPI_Allreduce(&size,&size_out_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
|
|
MPI_Allreduce(&size,&size_out_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
|
|
|
|
size = (bigint) nrvous*insize;
|
|
MPI_Allreduce(&size,&size_inrvous_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
|
|
MPI_Allreduce(&size,&size_inrvous_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
|
|
MPI_Allreduce(&size,&size_inrvous_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
|
|
|
|
size = (bigint) nrvous_out*insize;
|
|
MPI_Allreduce(&size,&size_outrvous_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
|
|
MPI_Allreduce(&size,&size_outrvous_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
|
|
MPI_Allreduce(&size,&size_outrvous_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
|
|
|
|
size = commsize;
|
|
MPI_Allreduce(&size,&size_comm_all,1,MPI_LMP_BIGINT,MPI_SUM,world);
|
|
MPI_Allreduce(&size,&size_comm_max,1,MPI_LMP_BIGINT,MPI_MAX,world);
|
|
MPI_Allreduce(&size,&size_comm_min,1,MPI_LMP_BIGINT,MPI_MIN,world);
|
|
|
|
int mbytes = 1024*1024;
|
|
|
|
if (me == 0) {
|
|
std::string mesg = "Rendezvous balance and memory info: (tot,ave,max,min) \n";
|
|
mesg += fmt::format(" input datum count: {} {} {} {}\n",
|
|
size_in_all/insize,1.0*size_in_all/nprocs/insize,
|
|
size_in_max/insize,size_in_min/insize);
|
|
mesg += fmt::format(" input data (MB): {:.6} {:.6} {:.6} {:.6}\n",
|
|
1.0*size_in_all/mbytes,1.0*size_in_all/nprocs/mbytes,
|
|
1.0*size_in_max/mbytes,1.0*size_in_min/mbytes);
|
|
if (outsize)
|
|
mesg += fmt::format(" output datum count: {} {} {} {}\n",
|
|
size_out_all/outsize,1.0*size_out_all/nprocs/outsize,
|
|
size_out_max/outsize,size_out_min/outsize);
|
|
else
|
|
mesg += fmt::format(" output datum count: {} {:.6} {} {}\n",0,0.0,0,0);
|
|
|
|
mesg += fmt::format(" output data (MB): {:.6} {:.6} {:.6} {:.6}\n",
|
|
1.0*size_out_all/mbytes,1.0*size_out_all/nprocs/mbytes,
|
|
1.0*size_out_max/mbytes,1.0*size_out_min/mbytes);
|
|
mesg += fmt::format(" input rvous datum count: {} {} {} {}\n",
|
|
size_inrvous_all/insize,1.0*size_inrvous_all/nprocs/insize,
|
|
size_inrvous_max/insize,size_inrvous_min/insize);
|
|
mesg += fmt::format(" input rvous data (MB): {:.6} {:.6} {:.6} {:.6}\n",
|
|
1.0*size_inrvous_all/mbytes,1.0*size_inrvous_all/nprocs/mbytes,
|
|
1.0*size_inrvous_max/mbytes,1.0*size_inrvous_min/mbytes);
|
|
if (outsize)
|
|
mesg += fmt::format(" output rvous datum count: {} {} {} {}\n",
|
|
size_outrvous_all/outsize,1.0*size_outrvous_all/nprocs/outsize,
|
|
size_outrvous_max/outsize,size_outrvous_min/outsize);
|
|
else
|
|
mesg += fmt::format(" output rvous datum count: {} {:.6} {} {}\n",0,0.0,0,0);
|
|
mesg += fmt::format(" output rvous data (MB): {:.6} {:.6} {:.6} {:.6}\n",
|
|
1.0*size_outrvous_all/mbytes,1.0*size_outrvous_all/nprocs/mbytes,
|
|
1.0*size_outrvous_max/mbytes,1.0*size_outrvous_min/mbytes);
|
|
mesg += fmt::format(" rvous comm (MB): {:.6} {:.6} {:.6} {:.6}\n",
|
|
1.0*size_comm_all/mbytes,1.0*size_comm_all/nprocs/mbytes,
|
|
1.0*size_comm_max/mbytes,1.0*size_comm_min/mbytes);
|
|
utils::logmesg(lmp,mesg);
|
|
}
|
|
}
|