Add option to specify # of numa domains for processors grid numa style, simplify numa mapping with more aggressive optimization to reduce comm between numa domains.
This commit is contained in:
@ -25,6 +25,8 @@ Syntax
|
||||
*numa* params = none
|
||||
*custom* params = infile
|
||||
infile = file containing grid layout
|
||||
*numa_nodes* arg = Nn
|
||||
Nn = number of numa domains per node
|
||||
*map* arg = *cart* or *cart/reorder* or *xyz* or *xzy* or *yxz* or *yzx* or *zxy* or *zyx*
|
||||
cart = use MPI_Cart() methods to map processors to 3d grid with reorder = 0
|
||||
cart/reorder = use MPI_Cart() methods to map processors to 3d grid with reorder = 1
|
||||
@ -159,24 +161,28 @@ surface-to-volume ratio of each processor's subdomain.
|
||||
|
||||
The *numa* style operates similar to the *twolevel* keyword except
|
||||
that it auto-detects which cores are running on which nodes.
|
||||
Currently, it does this in only 2 levels, but it may be extended in
|
||||
the future to account for socket topology and other non-uniform memory
|
||||
access (NUMA) costs. It also uses a different algorithm than the
|
||||
*twolevel* keyword for doing the two-level factorization of the
|
||||
simulation box into a 3d processor grid to minimize off-node
|
||||
communication, and it does its own MPI-based mapping of nodes and
|
||||
It will also subdivide the cores into numa domains. Currently, the
|
||||
number of numa domains is not autodetected and must be specified using
|
||||
the *numa_nodes* keyword; otherwise, the default value is used. The
|
||||
*numa* style uses a different algorithm than the *twolevel* keyword for
|
||||
doing the two-level factorization of the simulation box into a 3d
|
||||
processor grid to minimize off-node communication and communication
|
||||
across numa domains. It does its own MPI-based mapping of nodes and
|
||||
cores to the regular 3d grid. Thus it may produce a different layout
|
||||
of the processors than the *twolevel* options.
|
||||
|
||||
The *numa* style will give an error if the number of MPI processes is
|
||||
not divisible by the number of cores used per node, or any of the Px
|
||||
or Py of Pz values is greater than 1.
|
||||
or Py or Pz values is greater than 1.
|
||||
|
||||
.. note::
|
||||
|
||||
Unlike the *twolevel* style, the *numa* style does not require
|
||||
any particular ordering of MPI ranks i norder to work correctly. This
|
||||
any particular ordering of MPI ranks in order to work correctly. This
|
||||
is because it auto-detects which processes are running on which nodes.
|
||||
However, it assumes that the lowest ranks are in the first numa
|
||||
domain, and so forth. MPI rank orderings that do not preserve this
|
||||
property might result in more intranode communication between CPUs.
|
||||
|
||||
The *custom* style uses the file *infile* to define both the 3d
|
||||
factorization and the mapping of processors to the grid.
|
||||
@ -207,6 +213,14 @@ any order, but no processor ID should appear more than once.
|
||||
|
||||
----------
|
||||
|
||||
The *numa_nodes* keyword is used to specify the number of numa domains
|
||||
per node. It is currently only used by the *numa* style for two-level
|
||||
factorization to reduce the amount of MPI communication between CPUs.
|
||||
A good setting for this will typically be equal to the number of CPU
|
||||
sockets per node.
|
||||
|
||||
----------
|
||||
|
||||
The *map* keyword affects how the P processor IDs (from 0 to P-1) are
|
||||
mapped to the 3d grid of processors. It is only used by the
|
||||
*onelevel* and *twolevel* grid settings.
|
||||
@ -356,5 +370,5 @@ Related commands
|
||||
Default
|
||||
"""""""
|
||||
|
||||
The option defaults are Px Py Pz = \* \* \*, grid = onelevel, and map =
|
||||
cart.
|
||||
The option defaults are Px Py Pz = \* \* \*, grid = onelevel, map =
|
||||
cart, and numa_nodes = 2.
|
||||
|
||||
@ -420,6 +420,7 @@ void Comm::set_processors(int narg, char **arg)
|
||||
error->all(FLERR,"Specified processors != physical processors");
|
||||
|
||||
int iarg = 3;
|
||||
numa_nodes = 2;
|
||||
while (iarg < narg) {
|
||||
if (strcmp(arg[iarg],"grid") == 0) {
|
||||
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
|
||||
@ -514,6 +515,12 @@ void Comm::set_processors(int narg, char **arg)
|
||||
outfile = utils::strdup(arg[iarg+1]);
|
||||
iarg += 2;
|
||||
|
||||
} else if (strcmp(arg[iarg],"numa_nodes") == 0) {
|
||||
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
|
||||
numa_nodes = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
|
||||
if (numa_nodes < 1) numa_nodes = 2;
|
||||
iarg += 2;
|
||||
|
||||
} else error->all(FLERR,"Illegal processors command");
|
||||
}
|
||||
|
||||
@ -565,7 +572,7 @@ void Comm::set_proc_grid(int outflag)
|
||||
otherflag,other_style,other_procgrid,other_coregrid);
|
||||
|
||||
} else if (gridflag == NUMA) {
|
||||
pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
|
||||
pmap->numa_grid(numa_nodes,nprocs,user_procgrid,procgrid,coregrid);
|
||||
|
||||
} else if (gridflag == CUSTOM) {
|
||||
pmap->custom_grid(customfile,nprocs,user_procgrid,procgrid);
|
||||
|
||||
@ -146,6 +146,7 @@ class Comm : protected Pointers {
|
||||
char xyz[4]; // xyz mapping of procs to 3d grid
|
||||
char *customfile; // file with custom proc map
|
||||
char *outfile; // proc grid/map output file
|
||||
int numa_nodes; // Number of numa domains per socket for 3d grid
|
||||
|
||||
int otherflag; // 1 if this partition dependent on another
|
||||
int other_style; // style of dependency
|
||||
|
||||
@ -150,13 +150,9 @@ void ProcMap::twolevel_grid(int nprocs, int *user_procgrid, int *procgrid,
|
||||
auto-detects NUMA sockets within a multi-core node
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
|
||||
int *numagrid)
|
||||
void ProcMap::numa_grid(int numa_nodes, int nprocs, int *user_procgrid,
|
||||
int *procgrid, int *numagrid)
|
||||
{
|
||||
// hardwire this for now
|
||||
|
||||
int numa_nodes = 1;
|
||||
|
||||
// get names of all nodes
|
||||
|
||||
int name_length;
|
||||
@ -181,6 +177,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
|
||||
}
|
||||
procs_per_node = name_map.begin()->second;
|
||||
procs_per_numa = procs_per_node / numa_nodes;
|
||||
if (procs_per_numa < 1) procs_per_numa = 1;
|
||||
|
||||
delete [] node_names;
|
||||
|
||||
@ -192,6 +189,24 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
|
||||
user_procgrid[2] > 1)
|
||||
error->all(FLERR,"Could not create numa grid of processors");
|
||||
|
||||
// factorization for the grid of NUMA nodes
|
||||
|
||||
int node_count = nprocs / procs_per_numa;
|
||||
|
||||
int **nodefactors;
|
||||
int nodepossible = factor(node_count,nullptr);
|
||||
memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
|
||||
nodepossible = factor(node_count,nodefactors);
|
||||
|
||||
if (domain->dimension == 2)
|
||||
nodepossible = cull_2d(nodepossible,nodefactors,3);
|
||||
nodepossible = cull_user(nodepossible,nodefactors,3,user_procgrid);
|
||||
|
||||
if (nodepossible == 0)
|
||||
error->all(FLERR,"Could not create numa grid of processors");
|
||||
|
||||
best_factors(nodepossible,nodefactors,nodegrid,1,1,1);
|
||||
|
||||
// user settings for the factorization per numa node
|
||||
// currently not user settable
|
||||
// if user specifies 1 for a proc grid dimension,
|
||||
@ -204,6 +219,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
|
||||
if (user_procgrid[1] == 1) user_numagrid[1] = 1;
|
||||
if (user_procgrid[2] == 1) user_numagrid[2] = 1;
|
||||
|
||||
// perform NUMA node factorization using subdomain sizes
|
||||
// initial factorization within NUMA node
|
||||
|
||||
int **numafactors;
|
||||
@ -218,38 +234,6 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
|
||||
if (numapossible == 0)
|
||||
error->all(FLERR,"Could not create numa grid of processors");
|
||||
|
||||
best_factors(numapossible,numafactors,numagrid,1,1,1);
|
||||
|
||||
// user_nodegrid = implied user constraints on nodes
|
||||
|
||||
int user_nodegrid[3];
|
||||
user_nodegrid[0] = user_procgrid[0] / numagrid[0];
|
||||
user_nodegrid[1] = user_procgrid[1] / numagrid[1];
|
||||
user_nodegrid[2] = user_procgrid[2] / numagrid[2];
|
||||
|
||||
// factorization for the grid of NUMA nodes
|
||||
|
||||
int node_count = nprocs / procs_per_numa;
|
||||
|
||||
int **nodefactors;
|
||||
int nodepossible = factor(node_count,nullptr);
|
||||
memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
|
||||
nodepossible = factor(node_count,nodefactors);
|
||||
|
||||
if (domain->dimension == 2)
|
||||
nodepossible = cull_2d(nodepossible,nodefactors,3);
|
||||
nodepossible = cull_user(nodepossible,nodefactors,3,user_nodegrid);
|
||||
|
||||
if (nodepossible == 0)
|
||||
error->all(FLERR,"Could not create numa grid of processors");
|
||||
|
||||
best_factors(nodepossible,nodefactors,nodegrid,
|
||||
numagrid[0],numagrid[1],numagrid[2]);
|
||||
|
||||
// repeat NUMA node factorization using subdomain sizes
|
||||
// refines the factorization if the user specified the node layout
|
||||
// NOTE: this will not re-enforce user-procgrid constraint will it?
|
||||
|
||||
best_factors(numapossible,numafactors,numagrid,
|
||||
nodegrid[0],nodegrid[1],nodegrid[2]);
|
||||
|
||||
@ -270,6 +254,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
|
||||
procgrid[0] = nodegrid[0] * numagrid[0];
|
||||
procgrid[1] = nodegrid[1] * numagrid[1];
|
||||
procgrid[2] = nodegrid[2] * numagrid[2];
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
|
||||
@ -24,7 +24,7 @@ class ProcMap : protected Pointers {
|
||||
|
||||
void onelevel_grid(int, int *, int *, int, int, int *, int *);
|
||||
void twolevel_grid(int, int *, int *, int, int *, int *, int, int, int *, int *);
|
||||
void numa_grid(int, int *, int *, int *);
|
||||
void numa_grid(int, int, int *, int *, int *);
|
||||
void custom_grid(char *, int, int *, int *);
|
||||
void cart_map(int, int *, int *, int[3][2], int ***);
|
||||
void cart_map(int, int *, int, int *, int *, int[3][2], int ***);
|
||||
|
||||
Reference in New Issue
Block a user