From 3f215087d2e2c5546c7b8c96b0c307efbfd23408 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 13 Oct 2022 16:24:04 -0400 Subject: [PATCH] also print atom-step/s performance metric --- doc/src/Run_output.rst | 59 +++++++++++++++++++++--------------------- src/finish.cpp | 22 +++++++++++----- 2 files changed, 46 insertions(+), 35 deletions(-) diff --git a/doc/src/Run_output.rst b/doc/src/Run_output.rst index a988be94ad..28f66d6ae9 100644 --- a/doc/src/Run_output.rst +++ b/doc/src/Run_output.rst @@ -16,46 +16,47 @@ simulation. An example set of statistics is shown here: .. parsed-literal:: - Loop time of 2.81192 on 4 procs for 300 steps with 2004 atoms + Loop time of 0.942801 on 4 procs for 300 steps with 2004 atoms - Performance: 18.436 ns/day 1.302 hours/ns 106.689 timesteps/s - 97.0% CPU use with 4 MPI tasks x no OpenMP threads + Performance: 54.985 ns/day, 0.436 hours/ns, 318.201 timesteps/s, 637.674 katom-step/s + 195.2% CPU use with 2 MPI tasks x 2 OpenMP threads - MPI task timings breakdown: + MPI task timing breakdown: Section \| min time \| avg time \| max time \|%varavg\| %total --------------------------------------------------------------- - Pair \| 1.9808 \| 2.0134 \| 2.0318 \| 1.4 \| 71.60 - Bond \| 0.0021894 \| 0.0060319 \| 0.010058 \| 4.7 \| 0.21 - Kspace \| 0.3207 \| 0.3366 \| 0.36616 \| 3.1 \| 11.97 - Neigh \| 0.28411 \| 0.28464 \| 0.28516 \| 0.1 \| 10.12 - Comm \| 0.075732 \| 0.077018 \| 0.07883 \| 0.4 \| 2.74 - Output \| 0.00030518 \| 0.00042665 \| 0.00078821 \| 1.0 \| 0.02 - Modify \| 0.086606 \| 0.086631 \| 0.086668 \| 0.0 \| 3.08 - Other \| \| 0.007178 \| \| \| 0.26 + Pair \| 0.61419 \| 0.62872 \| 0.64325 \| 1.8 \| 66.69 + Bond \| 0.0028608 \| 0.0028899 \| 0.002919 \| 0.1 \| 0.31 + Kspace \| 0.12652 \| 0.14048 \| 0.15444 \| 3.7 \| 14.90 + Neigh \| 0.10242 \| 0.10242 \| 0.10242 \| 0.0 \| 10.86 + Comm \| 0.026753 \| 0.027593 \| 0.028434 \| 0.5 \| 2.93 + Output \| 0.00018341 \| 0.00030942 \| 0.00043542 \| 0.0 \| 0.03 + Modify \| 0.039117 
\| 0.039348 \| 0.039579 \| 0.1 \| 4.17 + Other \| \| 0.001041 \| \| \| 0.11 - Nlocal: 501 ave 508 max 490 min - Histogram: 1 0 0 0 0 0 1 1 0 1 - Nghost: 6586.25 ave 6628 max 6548 min - Histogram: 1 0 1 0 0 0 1 0 0 1 - Neighs: 177007 ave 180562 max 170212 min - Histogram: 1 0 0 0 0 0 0 1 1 1 + Nlocal: 1002 ave 1006 max 998 min + Histogram: 1 0 0 0 0 0 0 0 0 1 + Nghost: 8670.5 ave 8691 max 8650 min + Histogram: 1 0 0 0 0 0 0 0 0 1 + Neighs: 354010 ave 357257 max 350763 min + Histogram: 1 0 0 0 0 0 0 0 0 1 - Total # of neighbors = 708028 - Ave neighs/atom = 353.307 - Ave special neighs/atom = 2.34032 + Total # of neighbors = 708020 + Ave neighs/atom = 353.30339 + Ave special neighs/atom = 2.3403194 Neighbor list builds = 26 Dangerous builds = 0 ---------- -The first section provides a global loop timing summary. The *loop -time* is the total wall-clock time for the simulation to run. The -*Performance* line is provided for convenience to help predict how -long it will take to run a desired physical simulation. The *CPU use* -line provides the CPU utilization per MPI task; it should be close to -100% times the number of OpenMP threads (or 1 of not using OpenMP). -Lower numbers correspond to delays due to file I/O or insufficient -thread utilization. +The first section provides a global loop timing summary. The *loop time* +is the total wall-clock time for the simulation to run. The +*Performance* line is provided for convenience to help predict how long +it will take to run a desired physical simulation and to have numbers +useful for performance comparison between different simulation settings +or system sizes. The *CPU use* line provides the CPU utilization per +MPI task; it should be close to 100% times the number of OpenMP threads +(or 1 if not using OpenMP). Lower numbers correspond to delays due to +file I/O or insufficient thread utilization. 
---------- diff --git a/src/finish.cpp b/src/finish.cpp index b35fed0dde..2fe4e6fdb7 100644 --- a/src/finish.cpp +++ b/src/finish.cpp @@ -141,21 +141,31 @@ void Finish::end(int flag) (strcmp(update->unit_style,"real") == 0))) { double one_fs = force->femtosecond; double t_step = ((double) time_loop) / ((double) update->nsteps); - double step_t = 1.0/t_step; + double step_t = 1.0 / t_step; + double atomstep_s = (double)atom->natoms * step_t; + std::string atomstep_u = "atom-step/s"; + if (atomstep_s > 1000000.0) { + atomstep_u = "Matom-step/s"; + atomstep_s /= 1000000.0; + } else if (atomstep_s > 1000.0) { + atomstep_u = "katom-step/s"; + atomstep_s /= 1000.0; + } if (strcmp(update->unit_style,"lj") == 0) { double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs; - utils::logmesg(lmp,"Performance: {:.3f} tau/day, {:.3f} timesteps/s\n",tau_day,step_t); + utils::logmesg(lmp, "Performance: {:.3f} tau/day, {:.3f} timesteps/s, {:.3f} {}\n", + tau_day, step_t, atomstep_s, atomstep_u); } else if (strcmp(update->unit_style,"electron") == 0) { double hrs_fs = t_step / update->dt * one_fs / 3600.0; double fs_day = 24.0*3600.0 / t_step * update->dt / one_fs; - utils::logmesg(lmp,"Performance: {:.3f} fs/day, {:.3f} hours/fs, " - "{:.3f} timesteps/s\n",fs_day,hrs_fs,step_t); + utils::logmesg(lmp,"Performance: {:.3f} fs/day, {:.3f} hours/fs, {:.3f} timesteps/s, " + "{:.3f} {}\n", fs_day, hrs_fs, step_t, atomstep_s, atomstep_u); } else { double hrs_ns = t_step / update->dt * 1000000.0 * one_fs / 3600.0; double ns_day = 24.0*3600.0 / t_step * update->dt / one_fs/1000000.0; - utils::logmesg(lmp,"Performance: {:.3f} ns/day, {:.3f} hours/ns, " - "{:.3f} timesteps/s\n",ns_day,hrs_ns,step_t); + utils::logmesg(lmp,"Performance: {:.3f} ns/day, {:.3f} hours/ns, {:.3f} timesteps/s, " + "{:.3f} {}\n", ns_day, hrs_ns, step_t, atomstep_s, atomstep_u); } }