diff --git a/doc/src/processors.rst b/doc/src/processors.rst
index 921bbcc667..a11febb1c2 100644
--- a/doc/src/processors.rst
+++ b/doc/src/processors.rst
@@ -25,6 +25,8 @@ Syntax
           *numa* params = none
           *custom* params = infile
             infile = file containing grid layout
+       *numa_nodes* arg = Nn
+         Nn = number of numa domains per node
        *map* arg = *cart* or *cart/reorder* or *xyz* or *xzy* or *yxz* or *yzx* or *zxy* or *zyx*
          cart = use MPI_Cart() methods to map processors to 3d grid with reorder = 0
          cart/reorder = use MPI_Cart() methods to map processors to 3d grid with reorder = 1
@@ -159,24 +161,28 @@ surface-to-volume ratio of each processor's subdomain.
 
 The *numa* style operates similar to the *twolevel* keyword except
 that it auto-detects which cores are running on which nodes.
-Currently, it does this in only 2 levels, but it may be extended in
-the future to account for socket topology and other non-uniform memory
-access (NUMA) costs.  It also uses a different algorithm than the
-*twolevel* keyword for doing the two-level factorization of the
-simulation box into a 3d processor grid to minimize off-node
-communication, and it does its own MPI-based mapping of nodes and
+It will also subdivide the cores into numa domains.  Currently, the
+number of numa domains is not auto-detected and must be specified using
+the *numa_nodes* keyword; otherwise, the default value is used.  The
+*numa* style uses a different algorithm than the *twolevel* keyword for
+doing the two-level factorization of the simulation box into a 3d
+processor grid to minimize off-node communication and communication
+across numa domains.  It does its own MPI-based mapping of nodes and
 cores to the regular 3d grid.  Thus it may produce a different layout
 of the processors than the *twolevel* options.
 
 The *numa* style will give an error if the number of MPI processes is
 not divisible by the number of cores used per node, or any of the Px
-or Py of Pz values is greater than 1.
+or Py or Pz values is greater than 1.
 
 .. note::
 
    Unlike the *twolevel* style, the *numa* style does not require
-   any particular ordering of MPI ranks i norder to work correctly. This
+   any particular ordering of MPI ranks in order to work correctly.  This
    is because it auto-detects which processes are running on which nodes.
+   However, it assumes that the lowest ranks are in the first numa
+   domain, and so forth.  MPI rank orderings that do not preserve this
+   property may result in extra communication between CPU sockets.
 
 The *custom* style uses the file *infile* to define both the 3d
 factorization and the mapping of processors to the grid.
@@ -207,6 +213,14 @@ any order, but no processor ID should appear more than once.
 
 ----------
 
+The *numa_nodes* keyword is used to specify the number of numa domains
+per node.  It is currently only used by the *numa* style for two-level
+factorization to reduce the amount of MPI communication between CPU
+sockets.  A good setting will typically be equal to the number of CPU
+sockets per node.
+
+----------
+
 The *map* keyword affects how the P processor IDs (from 0 to P-1) are
 mapped to the 3d grid of processors.  It is only used by the
 *onelevel* and *twolevel* grid settings.
@@ -356,5 +370,5 @@ Related commands
 Default
 """""""
 
-The option defaults are Px Py Pz = \* \* \*, grid = onelevel, and map =
-cart.
+The option defaults are Px Py Pz = \* \* \*, grid = onelevel, map =
+cart, and numa_nodes = 2.
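For context, the new keyword is only meaningful together with the *numa*
grid style. A minimal input-script usage sketch (the value 2 here is
illustrative; it happens to match the default):

    processors * * * grid numa numa_nodes 2
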
diff --git a/src/comm.cpp b/src/comm.cpp
index 1293dd3d6d..02999fd541 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -420,6 +420,7 @@ void Comm::set_processors(int narg, char **arg)
     error->all(FLERR,"Specified processors != physical processors");
 
   int iarg = 3;
+  numa_nodes = 2;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"grid") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
@@ -514,6 +515,12 @@ void Comm::set_processors(int narg, char **arg)
       outfile = utils::strdup(arg[iarg+1]);
       iarg += 2;
 
+    } else if (strcmp(arg[iarg],"numa_nodes") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal processors command");
+      numa_nodes = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+      if (numa_nodes < 1) numa_nodes = 2;
+      iarg += 2;
+
     } else error->all(FLERR,"Illegal processors command");
   }
 
@@ -565,7 +572,7 @@ void Comm::set_proc_grid(int outflag)
                  otherflag,other_style,other_procgrid,other_coregrid);
 
   } else if (gridflag == NUMA) {
-    pmap->numa_grid(nprocs,user_procgrid,procgrid,coregrid);
+    pmap->numa_grid(numa_nodes,nprocs,user_procgrid,procgrid,coregrid);
 
   } else if (gridflag == CUSTOM) {
     pmap->custom_grid(customfile,nprocs,user_procgrid,procgrid);
diff --git a/src/comm.h b/src/comm.h
index 5d803c1afa..fde4c3b81f 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -146,6 +146,7 @@ class Comm : protected Pointers {
   char xyz[4];                     // xyz mapping of procs to 3d grid
   char *customfile;                // file with custom proc map
   char *outfile;                   // proc grid/map output file
+  int numa_nodes;                  // number of numa domains per node for 3d grid
 
   int otherflag;                   // 1 if this partition dependent on another
   int other_style;                 // style of dependency
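The hunk above follows the usual keyword/value pattern of
Comm::set_processors(). A self-contained C++ sketch of just that
behavior (the helper name parse_numa_nodes is hypothetical and no
LAMMPS classes are used; note the silent fallback to the default of 2
for values below 1):

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Scan an argument list for "numa_nodes <N>", mirroring the parse
    // loop above: default is 2, and values below 1 revert to the default.
    static int parse_numa_nodes(int narg, char **arg)
    {
      int numa_nodes = 2;                      // default, set before the loop
      for (int iarg = 0; iarg < narg; iarg++) {
        if (strcmp(arg[iarg],"numa_nodes") == 0) {
          if (iarg + 1 >= narg) {              // keyword present, value missing
            fprintf(stderr,"Illegal processors command\n");
            exit(1);
          }
          numa_nodes = atoi(arg[iarg+1]);
          if (numa_nodes < 1) numa_nodes = 2;  // clamp nonsensical values
          iarg++;                              // skip the consumed value
        }
      }
      return numa_nodes;
    }

    int main()
    {
      char a0[] = "grid", a1[] = "numa", a2[] = "numa_nodes", a3[] = "4";
      char *args[] = {a0, a1, a2, a3};
      printf("numa_nodes = %d\n", parse_numa_nodes(4, args));  // prints 4
      return 0;
    }

Whether an out-of-range value should revert silently, as here, or raise
an error like the other keywords do is a design choice the hunk leaves
open.
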
diff --git a/src/procmap.cpp b/src/procmap.cpp
index 71e1cf5a6b..b520354f53 100644
--- a/src/procmap.cpp
+++ b/src/procmap.cpp
@@ -150,13 +150,9 @@ void ProcMap::twolevel_grid(int nprocs, int *user_procgrid, int *procgrid,
    auto-detects NUMA sockets within a multi-core node
------------------------------------------------------------------------- */
 
-void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
-                        int *numagrid)
+void ProcMap::numa_grid(int numa_nodes, int nprocs, int *user_procgrid,
+                        int *procgrid, int *numagrid)
 {
-  // hardwire this for now
-
-  int numa_nodes = 1;
-
   // get names of all nodes
 
   int name_length;
@@ -181,6 +177,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
   }
   procs_per_node = name_map.begin()->second;
   procs_per_numa = procs_per_node / numa_nodes;
+  if (procs_per_numa < 1) procs_per_numa = 1;
 
   delete [] node_names;
 
@@ -192,6 +189,24 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
       user_procgrid[2] > 1)
     error->all(FLERR,"Could not create numa grid of processors");
 
+  // factorization for the grid of NUMA nodes
+
+  int node_count = nprocs / procs_per_numa;
+
+  int **nodefactors;
+  int nodepossible = factor(node_count,nullptr);
+  memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
+  nodepossible = factor(node_count,nodefactors);
+
+  if (domain->dimension == 2)
+    nodepossible = cull_2d(nodepossible,nodefactors,3);
+  nodepossible = cull_user(nodepossible,nodefactors,3,user_procgrid);
+
+  if (nodepossible == 0)
+    error->all(FLERR,"Could not create numa grid of processors");
+
+  best_factors(nodepossible,nodefactors,nodegrid,1,1,1);
+
   // user settings for the factorization per numa node
   // currently not user settable
   // if user specifies 1 for a proc grid dimension,
@@ -204,6 +219,7 @@
   if (user_procgrid[1] == 1) user_numagrid[1] = 1;
   if (user_procgrid[2] == 1) user_numagrid[2] = 1;
 
+  // perform NUMA node factorization using subdomain sizes
   // initial factorization within NUMA node
 
   int **numafactors;
@@ -218,38 +234,6 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
   if (numapossible == 0)
     error->all(FLERR,"Could not create numa grid of processors");
 
-  best_factors(numapossible,numafactors,numagrid,1,1,1);
-
-  // user_nodegrid = implied user constraints on nodes
-
-  int user_nodegrid[3];
-  user_nodegrid[0] = user_procgrid[0] / numagrid[0];
-  user_nodegrid[1] = user_procgrid[1] / numagrid[1];
-  user_nodegrid[2] = user_procgrid[2] / numagrid[2];
-
-  // factorization for the grid of NUMA nodes
-
-  int node_count = nprocs / procs_per_numa;
-
-  int **nodefactors;
-  int nodepossible = factor(node_count,nullptr);
-  memory->create(nodefactors,nodepossible,3,"procmap:nodefactors");
-  nodepossible = factor(node_count,nodefactors);
-
-  if (domain->dimension == 2)
-    nodepossible = cull_2d(nodepossible,nodefactors,3);
-  nodepossible = cull_user(nodepossible,nodefactors,3,user_nodegrid);
-
-  if (nodepossible == 0)
-    error->all(FLERR,"Could not create numa grid of processors");
-
-  best_factors(nodepossible,nodefactors,nodegrid,
-               numagrid[0],numagrid[1],numagrid[2]);
-
-  // repeat NUMA node factorization using subdomain sizes
-  // refines the factorization if the user specified the node layout
-  // NOTE: this will not re-enforce user-procgrid constraint will it?
-
   best_factors(numapossible,numafactors,numagrid,
                nodegrid[0],nodegrid[1],nodegrid[2]);
 
@@ -270,6 +254,7 @@ void ProcMap::numa_grid(int nprocs, int *user_procgrid, int *procgrid,
   procgrid[0] = nodegrid[0] * numagrid[0];
   procgrid[1] = nodegrid[1] * numagrid[1];
   procgrid[2] = nodegrid[2] * numagrid[2];
+
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/procmap.h b/src/procmap.h
index 06867837c6..2731aec984 100644
--- a/src/procmap.h
+++ b/src/procmap.h
@@ -24,7 +24,7 @@ class ProcMap : protected Pointers {
   void onelevel_grid(int, int *, int *, int, int, int *, int *);
   void twolevel_grid(int, int *, int *, int, int *, int *, int, int,
                      int *, int *);
-  void numa_grid(int, int *, int *, int *);
+  void numa_grid(int, int, int *, int *, int *);
   void custom_grid(char *, int, int *, int *);
   void cart_map(int, int *, int *, int[3][2], int ***);
   void cart_map(int, int *, int, int *, int *, int[3][2], int ***);
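To make the reordered factorization concrete: the grid of numa domains
is now factored first (culled directly against user_procgrid), the
per-domain grid is then refined against it, and the final processor
grid is the element-wise product of the two. A standalone numeric
sketch, assuming a hypothetical 32-rank run on nodes with 16 cores and
2 numa domains each (the nodegrid/numagrid values are hand-picked
stand-ins for what best_factors() would return):

    #include <cstdio>

    int main()
    {
      int nprocs = 32;             // total MPI ranks
      int procs_per_node = 16;     // auto-detected from node names
      int numa_nodes = 2;          // from the numa_nodes keyword

      int procs_per_numa = procs_per_node / numa_nodes;  // 8 cores/domain
      int node_count = nprocs / procs_per_numa;          // 4 domains total

      // one valid outcome of the two factorizations
      int nodegrid[3] = {2, 2, 1};   // 3d grid of numa domains
      int numagrid[3] = {2, 2, 2};   // 3d grid of cores within one domain

      // final processor grid = element-wise product, as in numa_grid()
      int procgrid[3];
      for (int i = 0; i < 3; i++) procgrid[i] = nodegrid[i] * numagrid[i];

      printf("procs_per_numa = %d, node_count = %d\n",
             procs_per_numa, node_count);
      printf("procgrid = %d x %d x %d (= %d ranks)\n",
             procgrid[0], procgrid[1], procgrid[2],
             procgrid[0] * procgrid[1] * procgrid[2]);
      return 0;
    }

This prints a 4 x 4 x 2 grid, which accounts for all 32 ranks while
keeping each 2 x 2 x 2 block of subdomains within one numa domain.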