Add option to specify the number of numa domains for the processors grid numa style; simplify numa mapping with more aggressive optimization to reduce communication between numa domains.

W. Michael Brown
2024-03-07 07:07:16 -08:00
parent 554f53decb
commit 8ab2544f5b
5 changed files with 57 additions and 50 deletions

View File

@@ -25,6 +25,8 @@ Syntax
*numa* params = none
*custom* params = infile
infile = file containing grid layout
*numa_nodes* arg = Nn
Nn = number of numa domains per node
*map* arg = *cart* or *cart/reorder* or *xyz* or *xzy* or *yxz* or *yzx* or *zxy* or *zyx*
cart = use MPI_Cart() methods to map processors to 3d grid with reorder = 0
cart/reorder = use MPI_Cart() methods to map processors to 3d grid with reorder = 1
@@ -159,24 +161,28 @@ surface-to-volume ratio of each processor's subdomain.
The *numa* style operates similar to the *twolevel* keyword except
that it auto-detects which cores are running on which nodes.
Currently, it does this in only 2 levels, but it may be extended in
the future to account for socket topology and other non-uniform memory
access (NUMA) costs. It also uses a different algorithm than the
*twolevel* keyword for doing the two-level factorization of the
simulation box into a 3d processor grid to minimize off-node
communication, and it does its own MPI-based mapping of nodes and
It will also subdivide the cores into numa domains. Currently, the
number of numa domains is not autodetected and must be specified using
the *numa_nodes* keyword; otherwise, the default value is used. The
*numa* style uses a different algorithm than the *twolevel* keyword for
doing the two-level factorization of the simulation box into a 3d
processor grid to minimize off-node communication and communication
across numa domains. It does its own MPI-based mapping of nodes and
cores to the regular 3d grid. Thus it may produce a different layout
of the processors than the *twolevel* options.
The *numa* style will give an error if the number of MPI processes is
not divisible by the number of cores used per node, or any of the Px
or Py of Pz values is greater than 1.
or Py or Pz values is greater than 1.
.. note::
Unlike the *twolevel* style, the *numa* style does not require
any particular ordering of MPI ranks i norder to work correctly. This
any particular ordering of MPI ranks in order to work correctly. This
is because it auto-detects which processes are running on which nodes.
However, it assumes that the lowest ranks are in the first numa
domain, and so forth. MPI rank orderings that do not preserve this
property might result in more intranode communication between CPUs.
The *custom* style uses the file *infile* to define both the 3d
factorization and the mapping of processors to the grid.
@@ -207,6 +213,14 @@ any order, but no processor ID should appear more than once.
----------
The *numa_nodes* keyword is used to specify the number of numa domains
per node. It is currently only used by the *numa* style for two-level
factorization to reduce the amount of MPI communication between CPUs.
A good setting for this will typically be equal to the number of CPU
sockets per node.
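For example, on nodes with two CPU sockets, the *numa* style with two
numa domains per node could be requested as follows (an illustrative
input line; the processor counts are left for LAMMPS to choose):

.. code-block:: LAMMPS

   processors * * * grid numa numa_nodes 2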
----------
The *map* keyword affects how the P processor IDs (from 0 to P-1) are
mapped to the 3d grid of processors. It is only used by the
*onelevel* and *twolevel* grid settings.
@@ -356,5 +370,5 @@ Related commands
Default
"""""""
The option defaults are Px Py Pz = \* \* \*, grid = onelevel, and map =
cart.
The option defaults are Px Py Pz = \* \* \*, grid = onelevel, map =
cart, and numa_nodes = 2.

View File

@@ -420,6 +420,7 @@ void Comm::set_processors(int narg, char **arg)
error->all(FLERR,"Specified processors != physical processors");
int iarg = 3;
numa_nodes = 2;
while (iarg < narg) {
if (strcmp(arg[iarg],"grid") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
@@ -514,6 +515,12 @@ void Comm::set_processors(int narg, char **arg)
outfile = utils::strdup(arg[iarg+1]);
iarg += 2;
} else if (strcmp(arg[iarg],"numa_nodes") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
numa_nodes = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
if (numa_nodes < 1) numa_nodes = 2;
iarg += 2;
} else error->all(FLERR,"Illegal processors command");
}
@@ -565,7 +572,7 @@ void Comm::set_proc_grid(int outflag)
otherflag,other_style,other_procgrid,other_coregrid);
} else if (gridflag == NUMA) {
pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
pmap->numa_grid(numa_nodes,nprocs,user_procgrid,procgrid,coregrid);
} else if (gridflag == CUSTOM) {
pmap->custom_grid(customfile,nprocs,user_procgrid,procgrid);

View File

@@ -146,6 +146,7 @@ class Comm : protected Pointers {
char xyz[4]; // xyz mapping of procs to 3d grid
char *customfile; // file with custom proc map
char *outfile; // proc grid/map output file
int numa_nodes; // number of numa domains per node for 3d grid
int otherflag; // 1 if this partition dependent on another
int other_style; // style of dependency

View File

@@ -150,13 +150,9 @@ void ProcMap::twolevel_grid(int nprocs, int *user_procgrid, int *procgrid,
auto-detects NUMA sockets within a multi-core node
------------------------------------------------------------------------- */
void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
int *numagrid)
void ProcMap::numa_grid(int numa_nodes, int nprocs, int *user_procgrid,
int *procgrid, int *numagrid)
{
// hardwire this for now
int numa_nodes = 1;
// get names of all nodes
int name_length;
@@ -181,6 +177,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
}
procs_per_node = name_map.begin()->second;
procs_per_numa = procs_per_node / numa_nodes;
if (procs_per_numa < 1) procs_per_numa = 1;
delete [] node_names;
@@ -192,6 +189,24 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
user_procgrid[2] > 1)
error->all(FLERR,"Could not create numa grid of processors");
// factorization for the grid of NUMA nodes
int node_count = nprocs / procs_per_numa;
int **nodefactors;
int nodepossible = factor(node_count,nullptr);
memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
nodepossible = factor(node_count,nodefactors);
if (domain->dimension == 2)
nodepossible = cull_2d(nodepossible,nodefactors,3);
nodepossible = cull_user(nodepossible,nodefactors,3,user_procgrid);
if (nodepossible == 0)
error->all(FLERR,"Could not create numa grid of processors");
best_factors(nodepossible,nodefactors,nodegrid,1,1,1);
// user settings for the factorization per numa node
// currently not user settable
// if user specifies 1 for a proc grid dimension,
@@ -204,6 +219,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
if (user_procgrid[1] == 1) user_numagrid[1] = 1;
if (user_procgrid[2] == 1) user_numagrid[2] = 1;
// perform NUMA node factorization using subdomain sizes
// initial factorization within NUMA node
int **numafactors;
@@ -218,38 +234,6 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
if (numapossible == 0)
error->all(FLERR,"Could not create numa grid of processors");
best_factors(numapossible,numafactors,numagrid,1,1,1);
// user_nodegrid = implied user constraints on nodes
int user_nodegrid[3];
user_nodegrid[0] = user_procgrid[0] / numagrid[0];
user_nodegrid[1] = user_procgrid[1] / numagrid[1];
user_nodegrid[2] = user_procgrid[2] / numagrid[2];
// factorization for the grid of NUMA nodes
int node_count = nprocs / procs_per_numa;
int **nodefactors;
int nodepossible = factor(node_count,nullptr);
memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
nodepossible = factor(node_count,nodefactors);
if (domain->dimension == 2)
nodepossible = cull_2d(nodepossible,nodefactors,3);
nodepossible = cull_user(nodepossible,nodefactors,3,user_nodegrid);
if (nodepossible == 0)
error->all(FLERR,"Could not create numa grid of processors");
best_factors(nodepossible,nodefactors,nodegrid,
numagrid[0],numagrid[1],numagrid[2]);
// repeat NUMA node factorization using subdomain sizes
// refines the factorization if the user specified the node layout
// NOTE: this will not re-enforce user-procgrid constraint will it?
best_factors(numapossible,numafactors,numagrid,
nodegrid[0],nodegrid[1],nodegrid[2]);
@@ -270,6 +254,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
procgrid[0] = nodegrid[0] * numagrid[0];
procgrid[1] = nodegrid[1] * numagrid[1];
procgrid[2] = nodegrid[2] * numagrid[2];
}
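To illustrate the shape of the result, here is a minimal standalone sketch (not LAMMPS code) of the two-level split that the reworked numa_grid() performs: factor the count of numa domains into a coarse 3d grid, factor the cores per numa domain into a fine grid, and multiply the two element-wise to obtain the full processor grid. The helper best_grid(), the machine parameters, and the cubic-box assumption are illustrative only; the real routine additionally culls candidate factorizations against the user constraints and the box dimensions, and refines the per-domain grid against the chosen node grid.

#include <array>
#include <cstdio>
#include <limits>

// enumerate all 3-factorizations of n and keep the one with the smallest
// per-subdomain surface area, assuming a unit cube cut into px*py*pz pieces
static std::array<int,3> best_grid(int n)
{
  std::array<int,3> best = {1, 1, n};
  double best_surf = std::numeric_limits<double>::max();
  for (int px = 1; px <= n; ++px) {
    if (n % px) continue;
    for (int py = 1; py <= n / px; ++py) {
      if ((n / px) % py) continue;
      int pz = n / px / py;
      double surf = 1.0/(px*py) + 1.0/(py*pz) + 1.0/(px*pz);
      if (surf < best_surf) { best_surf = surf; best = {px, py, pz}; }
    }
  }
  return best;
}

int main()
{
  // hypothetical machine: 128 MPI ranks, 64-core nodes, 2 numa domains per node
  int nprocs = 128, procs_per_node = 64, numa_nodes = 2;
  int procs_per_numa = procs_per_node / numa_nodes;  // cores per numa domain
  int node_count = nprocs / procs_per_numa;          // numa domains in the run

  std::array<int,3> nodegrid = best_grid(node_count);     // coarse grid of numa domains
  std::array<int,3> numagrid = best_grid(procs_per_numa); // fine grid within one domain
  int procgrid[3];                                        // full 3d processor grid
  for (int i = 0; i < 3; ++i) procgrid[i] = nodegrid[i] * numagrid[i];

  std::printf("nodegrid %d %d %d numagrid %d %d %d procgrid %d %d %d\n",
              nodegrid[0], nodegrid[1], nodegrid[2],
              numagrid[0], numagrid[1], numagrid[2],
              procgrid[0], procgrid[1], procgrid[2]);
  return 0;
}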
/* ----------------------------------------------------------------------

View File

@@ -24,7 +24,7 @@ class ProcMap : protected Pointers {
void onelevel_grid(int, int *, int *, int, int, int *, int *);
void twolevel_grid(int, int *, int *, int, int *, int *, int, int, int *, int *);
void numa_grid(int, int *, int *, int *);
void numa_grid(int, int, int *, int *, int *);
void custom_grid(char *, int, int *, int *);
void cart_map(int, int *, int *, int[3][2], int ***);
void cart_map(int, int *, int, int *, int *, int[3][2], int ***);