From 3f215087d2e2c5546c7b8c96b0c307efbfd23408 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Thu, 13 Oct 2022 16:24:04 -0400
Subject: [PATCH] also print atom-step/s performance metric

---
 doc/src/Run_output.rst | 59 +++++++++++++++++++++---------------------
 src/finish.cpp         | 22 +++++++++++-----
 2 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/doc/src/Run_output.rst b/doc/src/Run_output.rst
index a988be94ad..28f66d6ae9 100644
--- a/doc/src/Run_output.rst
+++ b/doc/src/Run_output.rst
@@ -16,46 +16,47 @@ simulation.  An example set of statistics is shown here:
 
 .. parsed-literal::
 
-   Loop time of 2.81192 on 4 procs for 300 steps with 2004 atoms
+   Loop time of 0.942801 on 4 procs for 300 steps with 2004 atoms
 
-   Performance: 18.436 ns/day  1.302 hours/ns  106.689 timesteps/s
-   97.0% CPU use with 4 MPI tasks x no OpenMP threads
+   Performance: 54.985 ns/day, 0.436 hours/ns, 318.201 timesteps/s, 637.674 katom-step/s
+   195.2% CPU use with 2 MPI tasks x 2 OpenMP threads
 
-   MPI task timings breakdown:
+   MPI task timing breakdown:
    Section \|  min time  \|  avg time  \|  max time  \|%varavg\| %total
    ---------------------------------------------------------------
-   Pair    \| 1.9808     \| 2.0134     \| 2.0318     \|   1.4 \| 71.60
-   Bond    \| 0.0021894  \| 0.0060319  \| 0.010058   \|   4.7 \|  0.21
-   Kspace  \| 0.3207     \| 0.3366     \| 0.36616    \|   3.1 \| 11.97
-   Neigh   \| 0.28411    \| 0.28464    \| 0.28516    \|   0.1 \| 10.12
-   Comm    \| 0.075732   \| 0.077018   \| 0.07883    \|   0.4 \|  2.74
-   Output  \| 0.00030518 \| 0.00042665 \| 0.00078821 \|   1.0 \|  0.02
-   Modify  \| 0.086606   \| 0.086631   \| 0.086668   \|   0.0 \|  3.08
-   Other   \|            \| 0.007178   \|            \|       \|  0.26
+   Pair    \| 0.61419    \| 0.62872    \| 0.64325    \|   1.8 \| 66.69
+   Bond    \| 0.0028608  \| 0.0028899  \| 0.002919   \|   0.1 \|  0.31
+   Kspace  \| 0.12652    \| 0.14048    \| 0.15444    \|   3.7 \| 14.90
+   Neigh   \| 0.10242    \| 0.10242    \| 0.10242    \|   0.0 \| 10.86
+   Comm    \| 0.026753   \| 0.027593   \| 0.028434   \|   0.5 \|  2.93
+   Output  \| 0.00018341 \| 0.00030942 \| 0.00043542 \|   0.0 \|  0.03
+   Modify  \| 0.039117   \| 0.039348   \| 0.039579   \|   0.1 \|  4.17
+   Other   \|            \| 0.001041   \|            \|       \|  0.11
 
-   Nlocal:    501 ave 508 max 490 min
-   Histogram: 1 0 0 0 0 0 1 1 0 1
-   Nghost:    6586.25 ave 6628 max 6548 min
-   Histogram: 1 0 1 0 0 0 1 0 0 1
-   Neighs:    177007 ave 180562 max 170212 min
-   Histogram: 1 0 0 0 0 0 0 1 1 1
+   Nlocal:           1002 ave        1006 max         998 min
+   Histogram: 1 0 0 0 0 0 0 0 0 1
+   Nghost:         8670.5 ave        8691 max        8650 min
+   Histogram: 1 0 0 0 0 0 0 0 0 1
+   Neighs:         354010 ave      357257 max      350763 min
+   Histogram: 1 0 0 0 0 0 0 0 0 1
 
-   Total # of neighbors = 708028
-   Ave neighs/atom = 353.307
-   Ave special neighs/atom = 2.34032
+   Total # of neighbors = 708020
+   Ave neighs/atom = 353.30339
+   Ave special neighs/atom = 2.3403194
    Neighbor list builds = 26
    Dangerous builds = 0
 
 ----------
 
-The first section provides a global loop timing summary. The *loop
-time* is the total wall-clock time for the simulation to run.  The
-*Performance* line is provided for convenience to help predict how
-long it will take to run a desired physical simulation.  The *CPU use*
-line provides the CPU utilization per MPI task; it should be close to
-100% times the number of OpenMP threads (or 1 of not using OpenMP).
-Lower numbers correspond to delays due to file I/O or insufficient
-thread utilization.
+The first section provides a global loop timing summary. The *loop time*
+is the total wall-clock time for the simulation to run.  The
+*Performance* line is provided for convenience to help predict how long
+it will take to run a desired physical simulation and to have numbers
+useful for performance comparison between different simulation settings
+or system sizes.  The *CPU use* line provides the CPU utilization per
+MPI task; it should be close to 100% times the number of OpenMP threads
+(or 1 if not using OpenMP).  Lower numbers correspond to delays due to
+file I/O or insufficient thread utilization.
 
 ----------
 
diff --git a/src/finish.cpp b/src/finish.cpp
index b35fed0dde..2fe4e6fdb7 100644
--- a/src/finish.cpp
+++ b/src/finish.cpp
@@ -141,21 +141,31 @@ void Finish::end(int flag)
             (strcmp(update->unit_style,"real") == 0))) {
         double one_fs = force->femtosecond;
         double t_step = ((double) time_loop) / ((double) update->nsteps);
-        double step_t = 1.0/t_step;
+        double step_t = 1.0 / t_step;
+        double atomstep_s = (double)atom->natoms * step_t;
+        std::string atomstep_u = "atom-step/s";
+        if (atomstep_s > 1000000.0) {
+          atomstep_u = "Matom-step/s";
+          atomstep_s /= 1000000.0;
+        } else if (atomstep_s > 1000.0) {
+          atomstep_u = "katom-step/s";
+          atomstep_s /= 1000.0;
+        }
 
         if (strcmp(update->unit_style,"lj") == 0) {
           double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs;
-          utils::logmesg(lmp,"Performance: {:.3f} tau/day, {:.3f} timesteps/s\n",tau_day,step_t);
+          utils::logmesg(lmp, "Performance: {:.3f} tau/day, {:.3f} timesteps/s, {:.3f} {}\n",
+                         tau_day, step_t, atomstep_s, atomstep_u);
         } else if (strcmp(update->unit_style,"electron") == 0) {
           double hrs_fs = t_step / update->dt * one_fs / 3600.0;
           double fs_day = 24.0*3600.0 / t_step * update->dt / one_fs;
-          utils::logmesg(lmp,"Performance: {:.3f} fs/day, {:.3f} hours/fs, "
-                         "{:.3f} timesteps/s\n",fs_day,hrs_fs,step_t);
+          utils::logmesg(lmp,"Performance: {:.3f} fs/day, {:.3f} hours/fs, {:.3f} timesteps/s, "
+                         "{:.3f} {}\n", fs_day, hrs_fs, step_t, atomstep_s, atomstep_u);
         } else {
           double hrs_ns = t_step / update->dt * 1000000.0 * one_fs / 3600.0;
           double ns_day = 24.0*3600.0 / t_step * update->dt / one_fs/1000000.0;
-          utils::logmesg(lmp,"Performance: {:.3f} ns/day, {:.3f} hours/ns, "
-                         "{:.3f} timesteps/s\n",ns_day,hrs_ns,step_t);
+          utils::logmesg(lmp,"Performance: {:.3f} ns/day, {:.3f} hours/ns, {:.3f} timesteps/s, "
+                         "{:.3f} {}\n", ns_day, hrs_ns, step_t, atomstep_s, atomstep_u);
         }
       }