diff --git a/doc/src/processors.rst b/doc/src/processors.rst
index 921bbcc667..a11febb1c2 100644
--- a/doc/src/processors.rst
+++ b/doc/src/processors.rst
@@ -25,6 +25,8 @@ Syntax
           *numa* params = none
           *custom* params = infile
             infile = file containing grid layout
+       *numa_nodes* arg = Nn
+         Nn = number of numa domains per node
        *map* arg = *cart* or *cart/reorder* or *xyz* or *xzy* or *yxz* or *yzx* or *zxy* or *zyx*
          cart = use MPI_Cart() methods to map processors to 3d grid with reorder = 0
          cart/reorder = use MPI_Cart() methods to map processors to 3d grid with reorder = 1
@@ -159,24 +161,28 @@ surface-to-volume ratio of each processor's subdomain.
 
 The *numa* style operates similar to the *twolevel* keyword except
 that it auto-detects which cores are running on which nodes.
-Currently, it does this in only 2 levels, but it may be extended in
-the future to account for socket topology and other non-uniform memory
-access (NUMA) costs.  It also uses a different algorithm than the
-*twolevel* keyword for doing the two-level factorization of the
-simulation box into a 3d processor grid to minimize off-node
-communication, and it does its own MPI-based mapping of nodes and
+It will also subdivide the cores into numa domains.  Currently, the
+number of numa domains is not auto-detected and must be specified using
+the *numa_nodes* keyword; otherwise, the default value is used.  The
+*numa* style uses a different algorithm than the *twolevel* keyword for
+doing the two-level factorization of the simulation box into a 3d
+processor grid to minimize off-node communication and communication
+across numa domains.  It does its own MPI-based mapping of nodes and
 cores to the regular 3d grid.  Thus it may produce a different layout
 of the processors than the *twolevel* options.
 
 The *numa* style will give an error if the number of MPI processes is
 not divisible by the number of cores used per node, or any of the Px
-or Py of Pz values is greater than 1.
+or Py or Pz values is greater than 1.
 
 .. note::
 
    Unlike the *twolevel* style, the *numa* style does not require
-   any particular ordering of MPI ranks i norder to work correctly. This
+   any particular ordering of MPI ranks in order to work correctly.  This
    is because it auto-detects which processes are running on which nodes.
+   However, it assumes that the lowest ranks are in the first numa
+   domain, and so forth.  MPI rank orderings that do not preserve this
+   property may result in extra communication between CPU sockets.
 
 The *custom* style uses the file *infile* to define both the 3d
 factorization and the mapping of processors to the grid.
@@ -207,6 +213,14 @@ any order, but no processor ID should appear more than once.
 
 ----------
 
+The *numa_nodes* keyword is used to specify the number of numa domains
+per node.  It is currently only used by the *numa* style for two-level
+factorization to reduce the amount of MPI communication between CPU
+sockets.  A good setting will typically be equal to the number of CPU
+sockets per node.
+
+----------
+
 The *map* keyword affects how the P processor IDs (from 0 to P-1) are
 mapped to the 3d grid of processors.  It is only used by the
 *onelevel* and *twolevel* grid settings.
@@ -356,5 +370,5 @@ Related commands
 Default
 """""""
 
-The option defaults are Px Py Pz = \* \* \*, grid = onelevel, and map =
-cart.
+The option defaults are Px Py Pz = \* \* \*, grid = onelevel, map =
+cart, and numa_nodes = 2.
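For context, the new keyword is only meaningful together with the *numa*
grid style. A minimal input-script usage sketch (the value 2 here is
illustrative; it happens to match the default):

    processors * * * grid numa numa_nodes 2
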
diff --git a/src/comm.cpp b/src/comm.cpp
index 1293dd3d6d..02999fd541 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -420,6 +420,7 @@ void Comm::set_processors(int narg, char **arg)
     error->all(FLERR,"Specified processors != physical processors");
 
   int iarg = 3;
+  numa_nodes = 2;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"grid") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
@@ -514,6 +515,12 @@ void Comm::set_processors(int narg, char **arg)
       outfile = utils::strdup(arg[iarg+1]);
       iarg += 2;
 
+    } else if (strcmp(arg[iarg],"numa_nodes") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
+      numa_nodes = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+      if (numa_nodes < 1) numa_nodes = 2;
+      iarg += 2;
+
     } else error->all(FLERR,"Illegal processors command");
   }
 
@@ -565,7 +572,7 @@ void Comm::set_proc_grid(int outflag)
                  otherflag,other_style,other_procgrid,other_coregrid);
 
   } else if (gridflag == NUMA) {
-    pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
+    pmap->numa_grid(numa_nodes,nprocs,user_procgrid,procgrid,coregrid);
 
   } else if (gridflag == CUSTOM) {
     pmap->custom_grid(customfile,nprocs,user_procgrid,procgrid);
diff --git a/src/comm.h b/src/comm.h
index 5d803c1afa..fde4c3b81f 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -146,6 +146,7 @@ class Comm : protected Pointers {
   char xyz[4];                     // xyz mapping of procs to 3d grid
   char *customfile;                // file with custom proc map
   char *outfile;                   // proc grid/map output file
+  int numa_nodes;                  // number of numa domains per node for 3d grid
 
   int otherflag;                   // 1 if this partition dependent on another
   int other_style;                 // style of dependency
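The hunk above follows the usual keyword/value pattern of
Comm::set_processors(). A self-contained C++ sketch of just that
behavior (the helper name parse_numa_nodes is hypothetical and no
LAMMPS classes are used; note the silent fallback to the default of 2
for values below 1):

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Scan an argument list for "numa_nodes <N>", mirroring the parse
    // loop above: default is 2, and values below 1 revert to the default.
    static int parse_numa_nodes(int narg, char **arg)
    {
      int numa_nodes = 2;                      // default, set before the loop
      for (int iarg = 0; iarg < narg; iarg++) {
        if (strcmp(arg[iarg],"numa_nodes") == 0) {
          if (iarg + 1 >= narg) {              // keyword present, value missing
            fprintf(stderr,"Illegal processors command\n");
            exit(1);
          }
          numa_nodes = atoi(arg[iarg+1]);
          if (numa_nodes < 1) numa_nodes = 2;  // clamp nonsensical values
          iarg++;                              // skip the consumed value
        }
      }
      return numa_nodes;
    }

    int main()
    {
      char a0[] = "grid", a1[] = "numa", a2[] = "numa_nodes", a3[] = "4";
      char *args[] = {a0, a1, a2, a3};
      printf("numa_nodes = %d\n", parse_numa_nodes(4, args));  // prints 4
      return 0;
    }

Whether an out-of-range value should revert silently, as here, or raise
an error like the other keywords do is a design choice the hunk leaves
open.
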
diff --git a/src/procmap.cpp b/src/procmap.cpp
index 71e1cf5a6b..b520354f53 100644
--- a/src/procmap.cpp
+++ b/src/procmap.cpp
@@ -150,13 +150,9 @@ void ProcMap::twolevel_grid(int nprocs, int *user_procgrid, int *procgrid,
    auto-detects NUMA sockets within a multi-core node
------------------------------------------------------------------------- */
 
-void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
-                        int *numagrid)
+void ProcMap::numa_grid(int numa_nodes, int nprocs, int *user_procgrid,
+                        int *procgrid, int *numagrid)
 {
-  // hardwire this for now
-
-  int numa_nodes = 1;
-
   // get names of all nodes
 
   int name_length;
@@ -181,6 +177,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
   }
   procs_per_node = name_map.begin()->second;
   procs_per_numa = procs_per_node / numa_nodes;
+  if (procs_per_numa < 1) procs_per_numa = 1;
 
   delete [] node_names;
 
@@ -192,6 +189,24 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
       user_procgrid[2] > 1)
     error->all(FLERR,"Could not create numa grid of processors");
 
+  // factorization for the grid of NUMA nodes
+
+  int node_count = nprocs / procs_per_numa;
+
+  int **nodefactors;
+  int nodepossible = factor(node_count,nullptr);
+  memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
+  nodepossible = factor(node_count,nodefactors);
+
+  if (domain->dimension == 2)
+    nodepossible = cull_2d(nodepossible,nodefactors,3);
+  nodepossible = cull_user(nodepossible,nodefactors,3,user_procgrid);
+
+  if (nodepossible == 0)
+    error->all(FLERR,"Could not create numa grid of processors");
+
+  best_factors(nodepossible,nodefactors,nodegrid,1,1,1);
+
   // user settings for the factorization per numa node
   // currently not user settable
   // if user specifies 1 for a proc grid dimension,
@@ -204,6 +219,7 @@
   if (user_procgrid[1] == 1) user_numagrid[1] = 1;
   if (user_procgrid[2] == 1) user_numagrid[2] = 1;
 
+  // perform NUMA node factorization using subdomain sizes
   // initial factorization within NUMA node
 
   int **numafactors;
@@ -218,38 +234,6 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
   if (numapossible == 0)
     error->all(FLERR,"Could not create numa grid of processors");
 
-  best_factors(numapossible,numafactors,numagrid,1,1,1);
-
-  // user_nodegrid = implied user constraints on nodes
-
-  int user_nodegrid[3];
-  user_nodegrid[0] = user_procgrid[0] / numagrid[0];
-  user_nodegrid[1] = user_procgrid[1] / numagrid[1];
-  user_nodegrid[2] = user_procgrid[2] / numagrid[2];
-
-  // factorization for the grid of NUMA nodes
-
-  int node_count = nprocs / procs_per_numa;
-
-  int **nodefactors;
-  int nodepossible = factor(node_count,nullptr);
-  memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
-  nodepossible = factor(node_count,nodefactors);
-
-  if (domain->dimension == 2)
-    nodepossible = cull_2d(nodepossible,nodefactors,3);
-  nodepossible = cull_user(nodepossible,nodefactors,3,user_nodegrid);
-
-  if (nodepossible == 0)
-    error->all(FLERR,"Could not create numa grid of processors");
-
-  best_factors(nodepossible,nodefactors,nodegrid,
-               numagrid[0],numagrid[1],numagrid[2]);
-
-  // repeat NUMA node factorization using subdomain sizes
-  // refines the factorization if the user specified the node layout
-  // NOTE: this will not re-enforce user-procgrid constraint will it?
-
   best_factors(numapossible,numafactors,numagrid,
                nodegrid[0],nodegrid[1],nodegrid[2]);
 
@@ -270,6 +254,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
   procgrid[0] = nodegrid[0] * numagrid[0];
   procgrid[1] = nodegrid[1] * numagrid[1];
   procgrid[2] = nodegrid[2] * numagrid[2];
+
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/procmap.h b/src/procmap.h
index 06867837c6..2731aec984 100644
--- a/src/procmap.h
+++ b/src/procmap.h
@@ -24,7 +24,7 @@ class ProcMap : protected Pointers {
   void onelevel_grid(int, int *, int *, int, int, int *, int *);
   void twolevel_grid(int, int *, int *, int, int *, int *, int, int,
                      int *, int *);
-  void numa_grid(int, int *, int *, int *);
+  void numa_grid(int, int, int *, int *, int *);
   void custom_grid(char *, int, int *, int *);
   void cart_map(int, int *, int *, int[3][2], int ***);
   void cart_map(int, int *, int, int *, int *, int[3][2], int ***);
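To make the reordered factorization concrete: the grid of numa domains
is now factored first (culled directly against user_procgrid), the
per-domain grid is then refined against it, and the final processor
grid is the element-wise product of the two. A standalone numeric
sketch, assuming a hypothetical 32-rank run on nodes with 16 cores and
2 numa domains each (the nodegrid/numagrid values are hand-picked
stand-ins for what best_factors() would return):

    #include <cstdio>

    int main()
    {
      int nprocs = 32;             // total MPI ranks
      int procs_per_node = 16;     // auto-detected from node names
      int numa_nodes = 2;          // from the numa_nodes keyword

      int procs_per_numa = procs_per_node / numa_nodes;  // 8 cores/domain
      int node_count = nprocs / procs_per_numa;          // 4 domains total

      // one valid outcome of the two factorizations
      int nodegrid[3] = {2, 2, 1};   // 3d grid of numa domains
      int numagrid[3] = {2, 2, 2};   // 3d grid of cores within one domain

      // final processor grid = element-wise product, as in numa_grid()
      int procgrid[3];
      for (int i = 0; i < 3; i++) procgrid[i] = nodegrid[i] * numagrid[i];

      printf("procs_per_numa = %d, node_count = %d\n",
             procs_per_numa, node_count);
      printf("procgrid = %d x %d x %d (= %d ranks)\n",
             procgrid[0], procgrid[1], procgrid[2],
             procgrid[0] * procgrid[1] * procgrid[2]);
      return 0;
    }

This prints a 4 x 4 x 2 grid, which accounts for all 32 ranks while
keeping each 2 x 2 x 2 block of subdomains within one numa domain.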