Add option to specify the number of numa domains for the processors grid numa style; simplify numa mapping with more aggressive optimization to reduce communication between numa domains.

W. Michael Brown
2024-03-07 07:07:16 -08:00
parent 554f53decb
commit 8ab2544f5b
5 changed files with 57 additions and 50 deletions

View File

@@ -25,6 +25,8 @@ Syntax
*numa* params = none
*custom* params = infile
infile = file containing grid layout
*numa_nodes* arg = Nn
Nn = number of numa domains per node
*map* arg = *cart* or *cart/reorder* or *xyz* or *xzy* or *yxz* or *yzx* or *zxy* or *zyx*
cart = use MPI_Cart() methods to map processors to 3d grid with reorder = 0
cart/reorder = use MPI_Cart() methods to map processors to 3d grid with reorder = 1
@@ -159,24 +161,28 @@ surface-to-volume ratio of each processor's subdomain.
The *numa* style operates similar to the *twolevel* keyword except
that it auto-detects which cores are running on which nodes.
Currently, it does this in only 2 levels, but it may be extended in
the future to account for socket topology and other non-uniform memory
access (NUMA) costs. It also uses a different algorithm than the
*twolevel* keyword for doing the two-level factorization of the
simulation box into a 3d processor grid to minimize off-node
communication, and it does its own MPI-based mapping of nodes and
It will also subdivide the cores into numa domains. Currently, the
number of numa domains is not autodetected and must be specified using
the *numa_nodes* keyword; otherwise, the default value is used. The
*numa* style uses a different algorithm than the *twolevel* keyword for
doing the two-level factorization of the simulation box into a 3d
processor grid to minimize off-node communication and communication
across numa domains. It does its own MPI-based mapping of nodes and
cores to the regular 3d grid. Thus it may produce a different layout
of the processors than the *twolevel* options.
The *numa* style will give an error if the number of MPI processes is
not divisible by the number of cores used per node, or any of the Px
or Py of Pz values is greater than 1.
or Py or Pz values is greater than 1.
.. note::
Unlike the *twolevel* style, the *numa* style does not require
any particular ordering of MPI ranks i norder to work correctly. This
any particular ordering of MPI ranks in order to work correctly. This
is because it auto-detects which processes are running on which nodes.
However, it assumes that the lowest ranks are in the first numa
domain, and so forth. MPI rank orderings that do not preserve this
property might result in more intranode communication between CPUs.
The *custom* style uses the file *infile* to define both the 3d
factorization and the mapping of processors to the grid.
@@ -207,6 +213,14 @@ any order, but no processor ID should appear more than once.
----------
The *numa_nodes* keyword is used to specify the number of numa domains
per node. It is currently only used by the *numa* style for two-level
factorization to reduce the amount of MPI communication between CPUs.
A good setting for this will typically be equal to the number of CPU
sockets per node.
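For example, on nodes with two CPU sockets, the *numa* style with two
numa domains per node could be requested as follows (an illustrative
input line; the processor counts are left for LAMMPS to choose):

.. code-block:: LAMMPS

   processors * * * grid numa numa_nodes 2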
----------
The *map* keyword affects how the P processor IDs (from 0 to P-1) are
mapped to the 3d grid of processors. It is only used by the
*onelevel* and *twolevel* grid settings.
@@ -356,5 +370,5 @@ Related commands
Default
"""""""
The option defaults are Px Py Pz = \* \* \*, grid = onelevel, and map =
cart.
The option defaults are Px Py Pz = \* \* \*, grid = onelevel, map =
cart, and numa_nodes = 2.

View File

@@ -420,6 +420,7 @@ void Comm::set_processors(int narg, char **arg)
error->all(FLERR,"Specified processors != physical processors");
int iarg = 3;
numa_nodes = 2;
while (iarg < narg) {
if (strcmp(arg[iarg],"grid") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
@@ -514,6 +515,12 @@ void Comm::set_processors(int narg, char **arg)
outfile = utils::strdup(arg[iarg+1]);
iarg += 2;
} else if (strcmp(arg[iarg],"numa_nodes") == 0) {
if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
numa_nodes = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
if (numa_nodes < 1) numa_nodes = 2;
iarg += 2;
} else error->all(FLERR,"Illegal processors command");
}
@@ -565,7 +572,7 @@ void Comm::set_proc_grid(int outflag)
otherflag,other_style,other_procgrid,other_coregrid);
} else if (gridflag == NUMA) {
pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
pmap->numa_grid(numa_nodes,nprocs,user_procgrid,procgrid,coregrid);
} else if (gridflag == CUSTOM) {
pmap->custom_grid(customfile,nprocs,user_procgrid,procgrid);

View File

@@ -146,6 +146,7 @@ class Comm : protected Pointers {
char xyz[4]; // xyz mapping of procs to 3d grid
char *customfile; // file with custom proc map
char *outfile; // proc grid/map output file
int numa_nodes; // number of numa domains per node for 3d grid
int otherflag; // 1 if this partition dependent on another
int other_style; // style of dependency

View File

@@ -150,13 +150,9 @@ void ProcMap::twolevel_grid(int nprocs, int *user_procgrid, int *procgrid,
auto-detects NUMA sockets within a multi-core node
------------------------------------------------------------------------- */
void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
int *numagrid)
void ProcMap::numa_grid(int numa_nodes, int nprocs, int *user_procgrid,
int *procgrid, int *numagrid)
{
// hardwire this for now
int numa_nodes = 1;
// get names of all nodes
int name_length;
@@ -181,6 +177,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
}
procs_per_node = name_map.begin()->second;
procs_per_numa = procs_per_node / numa_nodes;
if (procs_per_numa < 1) procs_per_numa = 1;
delete [] node_names;
@@ -192,6 +189,24 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
user_procgrid[2] > 1)
error->all(FLERR,"Could not create numa grid of processors");
// factorization for the grid of NUMA nodes
int node_count = nprocs / procs_per_numa;
int **nodefactors;
int nodepossible = factor(node_count,nullptr);
memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
nodepossible = factor(node_count,nodefactors);
if (domain->dimension == 2)
nodepossible = cull_2d(nodepossible,nodefactors,3);
nodepossible = cull_user(nodepossible,nodefactors,3,user_procgrid);
if (nodepossible == 0)
error->all(FLERR,"Could not create numa grid of processors");
best_factors(nodepossible,nodefactors,nodegrid,1,1,1);
// user settings for the factorization per numa node
// currently not user settable
// if user specifies 1 for a proc grid dimension,
@@ -204,6 +219,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
if (user_procgrid[1] == 1) user_numagrid[1] = 1;
if (user_procgrid[2] == 1) user_numagrid[2] = 1;
// perform NUMA node factorization using subdomain sizes
// initial factorization within NUMA node
int **numafactors;
@@ -218,38 +234,6 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
if (numapossible == 0)
error->all(FLERR,"Could not create numa grid of processors");
best_factors(numapossible,numafactors,numagrid,1,1,1);
// user_nodegrid = implied user constraints on nodes
int user_nodegrid[3];
user_nodegrid[0] = user_procgrid[0] / numagrid[0];
user_nodegrid[1] = user_procgrid[1] / numagrid[1];
user_nodegrid[2] = user_procgrid[2] / numagrid[2];
// factorization for the grid of NUMA nodes
int node_count = nprocs / procs_per_numa;
int **nodefactors;
int nodepossible = factor(node_count,nullptr);
memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
nodepossible = factor(node_count,nodefactors);
if (domain->dimension == 2)
nodepossible = cull_2d(nodepossible,nodefactors,3);
nodepossible = cull_user(nodepossible,nodefactors,3,user_nodegrid);
if (nodepossible == 0)
error->all(FLERR,"Could not create numa grid of processors");
best_factors(nodepossible,nodefactors,nodegrid,
numagrid[0],numagrid[1],numagrid[2]);
// repeat NUMA node factorization using subdomain sizes
// refines the factorization if the user specified the node layout
// NOTE: this will not re-enforce user-procgrid constraint will it?
best_factors(numapossible,numafactors,numagrid,
nodegrid[0],nodegrid[1],nodegrid[2]);
@@ -270,6 +254,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
procgrid[0] = nodegrid[0] * numagrid[0];
procgrid[1] = nodegrid[1] * numagrid[1];
procgrid[2] = nodegrid[2] * numagrid[2];
}
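To illustrate the shape of the result, here is a minimal standalone sketch (not LAMMPS code) of the two-level split that the reworked numa_grid() performs: factor the count of numa domains into a coarse 3d grid, factor the cores per numa domain into a fine grid, and multiply the two element-wise to obtain the full processor grid. The helper best_grid(), the machine parameters, and the cubic-box assumption are illustrative only; the real routine additionally culls candidate factorizations against the user constraints and the box dimensions, and refines the per-domain grid against the chosen node grid.

#include <array>
#include <cstdio>
#include <limits>

// enumerate all 3-factorizations of n and keep the one with the smallest
// per-subdomain surface area, assuming a unit cube cut into px*py*pz pieces
static std::array<int,3> best_grid(int n)
{
  std::array<int,3> best = {1, 1, n};
  double best_surf = std::numeric_limits<double>::max();
  for (int px = 1; px <= n; ++px) {
    if (n % px) continue;
    for (int py = 1; py <= n / px; ++py) {
      if ((n / px) % py) continue;
      int pz = n / px / py;
      double surf = 1.0/(px*py) + 1.0/(py*pz) + 1.0/(px*pz);
      if (surf < best_surf) { best_surf = surf; best = {px, py, pz}; }
    }
  }
  return best;
}

int main()
{
  // hypothetical machine: 128 MPI ranks, 64-core nodes, 2 numa domains per node
  int nprocs = 128, procs_per_node = 64, numa_nodes = 2;
  int procs_per_numa = procs_per_node / numa_nodes;  // cores per numa domain
  int node_count = nprocs / procs_per_numa;          // numa domains in the run

  std::array<int,3> nodegrid = best_grid(node_count);     // coarse grid of numa domains
  std::array<int,3> numagrid = best_grid(procs_per_numa); // fine grid within one domain
  int procgrid[3];                                        // full 3d processor grid
  for (int i = 0; i < 3; ++i) procgrid[i] = nodegrid[i] * numagrid[i];

  std::printf("nodegrid %d %d %d numagrid %d %d %d procgrid %d %d %d\n",
              nodegrid[0], nodegrid[1], nodegrid[2],
              numagrid[0], numagrid[1], numagrid[2],
              procgrid[0], procgrid[1], procgrid[2]);
  return 0;
}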
/* ----------------------------------------------------------------------

View File

@@ -24,7 +24,7 @@ class ProcMap : protected Pointers {
void onelevel_grid(int, int *, int *, int, int, int *, int *);
void twolevel_grid(int, int *, int *, int, int *, int *, int, int, int *, int *);
void numa_grid(int, int *, int *, int *);
void numa_grid(int, int, int *, int *, int *);
void custom_grid(char *, int, int *, int *);
void cart_map(int, int *, int *, int[3][2], int ***);
void cart_map(int, int *, int, int *, int *, int[3][2], int ***);