/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://lammps.sandia.gov/, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ #include "finish.h" #include "accelerator_kokkos.h" #include "atom.h" #include "atom_vec.h" #include "comm.h" #include "error.h" #include "force.h" #include "kspace.h" #include "memory.h" // IWYU pragma: keep #include "min.h" #include "molecule.h" #include "neigh_list.h" #include "neigh_request.h" #include "neighbor.h" // IWYU pragma: keep #include "timer.h" // IWYU pragma: keep #include "universe.h" #include "update.h" #include #include #ifdef LMP_USER_OMP #include "modify.h" #include "fix_omp.h" #include "thr_data.h" #endif using namespace LAMMPS_NS; // local function prototypes, code at end of file static void mpi_timings(const char *label, Timer *t, enum Timer::ttype tt, MPI_Comm world, const int nprocs, const int nthreads, const int me, double time_loop, FILE *scr, FILE *log); #ifdef LMP_USER_OMP static void omp_times(FixOMP *fix, const char *label, enum Timer::ttype which, const int nthreads,FILE *scr, FILE *log); #endif /* ---------------------------------------------------------------------- */ Finish::Finish(LAMMPS *lmp) : Pointers(lmp) {} /* ---------------------------------------------------------------------- */ void Finish::end(int flag) { int i,m,nneigh,nneighfull; int histo[10]; int minflag,prdflag,tadflag,hyperflag; int timeflag,fftflag,histoflag,neighflag; double time,tmp,ave,max,min; double time_loop,time_other,cpu_loop; int me,nprocs; MPI_Comm_rank(world,&me); MPI_Comm_size(world,&nprocs); const int nthreads = comm->nthreads; // recompute natoms in case atoms have been lost bigint nblocal = atom->nlocal; MPI_Allreduce(&nblocal,&atom->natoms,1,MPI_LMP_BIGINT,MPI_SUM,world); // choose flavors of statistical output // flag determines caller // flag = 0 = just loop summary // flag = 1 = dynamics or minimization // flag = 2 = PRD // flag = 3 = TAD // flag = 4 = HYPER // turn off neighflag for Kspace partition of verlet/split integrator minflag = prdflag = tadflag = hyperflag = 0; timeflag = fftflag = histoflag = neighflag = 0; time_loop = cpu_loop = time_other = 0.0; if (flag == 1) { if (update->whichflag == 2) minflag = 1; timeflag = histoflag = 1; neighflag = 1; if (update->whichflag == 1 && strncmp(update->integrate_style,"verlet/split",12) == 0 && universe->iworld == 1) neighflag = 0; if (force->kspace && force->kspace_match("^pppm",0) && force->kspace->fftbench) fftflag = 1; } if (flag == 2) prdflag = timeflag = histoflag = neighflag = 1; if (flag == 3) tadflag = histoflag = neighflag = 1; if (flag == 4) hyperflag = timeflag = histoflag = neighflag = 1; // loop stats if (timer->has_loop()) { // overall loop time time_loop = timer->get_wall(Timer::TOTAL); cpu_loop = timer->get_cpu(Timer::TOTAL); MPI_Allreduce(&time_loop,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time_loop = tmp/nprocs; MPI_Allreduce(&cpu_loop,&tmp,1,MPI_DOUBLE,MPI_SUM,world); cpu_loop = tmp/nprocs; if (time_loop > 0.0) cpu_loop = cpu_loop/time_loop*100.0; if (me == 0) { int ntasks = nprocs * nthreads; utils::logmesg(lmp,fmt::format("Loop time of {:.6g} on {} procs for " "{} steps with {} atoms\n\n",time_loop, ntasks,update->nsteps,atom->natoms)); // Gromacs/NAMD-style performance metric for suitable unit settings if ( timeflag && !minflag && !prdflag && !tadflag && (update->nsteps > 0) && (update->dt != 0.0) && ((strcmp(update->unit_style,"lj") == 0) || (strcmp(update->unit_style,"metal") == 0) || (strcmp(update->unit_style,"micro") == 0) || (strcmp(update->unit_style,"nano") == 0) || (strcmp(update->unit_style,"electron") == 0) || (strcmp(update->unit_style,"real") == 0))) { double one_fs = force->femtosecond; double t_step = ((double) time_loop) / ((double) update->nsteps); double step_t = 1.0/t_step; if (strcmp(update->unit_style,"lj") == 0) { double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs; utils::logmesg(lmp,fmt::format("Performance: {:.3f} tau/day, {:.3f} " "timesteps/s\n",tau_day,step_t)); } else if (strcmp(update->unit_style,"electron") == 0) { double hrs_fs = t_step / update->dt * one_fs / 3600.0; double fs_day = 24.0*3600.0 / t_step * update->dt / one_fs; utils::logmesg(lmp,fmt::format("Performance: {:.3f} fs/day, {:.3f} " "hours/fs, {:.3f} timesteps/s\n", fs_day,hrs_fs,step_t)); } else { double hrs_ns = t_step / update->dt * 1000000.0 * one_fs / 3600.0; double ns_day = 24.0*3600.0 / t_step * update->dt / one_fs/1000000.0; utils::logmesg(lmp,fmt::format("Performance: {:.3f} ns/day, {:.3f} " "hours/ns, {:.3f} timesteps/s\n", ns_day,hrs_ns,step_t)); } } // CPU use on MPI tasks and OpenMP threads if (timeflag) { if (lmp->kokkos) { utils::logmesg(lmp,fmt::format("{:.1f}% CPU use with {} MPI tasks " "x {} OpenMP threads\n",cpu_loop,nprocs, lmp->kokkos->nthreads)); } else { #if defined(_OPENMP) utils::logmesg(lmp,fmt::format("{:.1f}% CPU use with {} MPI tasks " "x {} OpenMP threads\n", cpu_loop,nprocs,nthreads)); #else utils::logmesg(lmp,fmt::format("{:.1f}% CPU use with {} MPI tasks " "x no OpenMP threads\n",cpu_loop,nprocs)); #endif } } } } // avoid division by zero for very short runs if (time_loop == 0.0) time_loop = 1.0; if (cpu_loop == 0.0) cpu_loop = 100.0; // get "Other" wall time for later use if (timer->has_normal()) time_other = timer->get_wall(Timer::TOTAL) - timer->get_wall(Timer::ALL); // minimization stats if (minflag) { if (me == 0) { std::string mesg = "\nMinimization stats:\n"; mesg += fmt::format(" Stopping criterion = {}\n", update->minimize->stopstr); mesg += fmt::format(" Energy initial, next-to-last, final = \n" " {:18.15g} {:18.15g} {:18.15g}\n", update->minimize->einitial, update->minimize->eprevious, update->minimize->efinal); mesg += fmt::format(" Force two-norm initial, final = {:.8} {:.8}\n", update->minimize->fnorm2_init,update->minimize->fnorm2_final); mesg += fmt::format(" Force max component initial, final = {:.8} {:.8}\n", update->minimize->fnorminf_init, update->minimize->fnorminf_final); mesg += fmt::format(" Final line search alpha, max atom move = {:.8} {:.8}\n", update->minimize->alpha_final, update->minimize->alpha_final* update->minimize->fnorminf_final); mesg += fmt::format(" Iterations, force evaluations = {} {}\n", update->minimize->niter,update->minimize->neval); utils::logmesg(lmp,mesg); } } // PRD stats if (prdflag) { if (me == 0) utils::logmesg(lmp,"\nPRD stats:\n"); time = timer->get_wall(Timer::DEPHASE); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Dephase time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::DYNAMICS); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Dynamics time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::QUENCH); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Quench time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::REPCOMM); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Comm time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::REPOUT); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Output time (%) = {} ({})\n", time,time/time_loop*100.0)); time = time_other; MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Other time (%) = {} ({})\n", time,time/time_loop*100.0)); } // TAD stats if (tadflag) { if (me == 0) utils::logmesg(lmp,"\nTAD stats:\n"); time = timer->get_wall(Timer::NEB); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" NEB time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::DYNAMICS); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Dynamics time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::QUENCH); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Quench time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::REPCOMM); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Comm time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::REPOUT); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Output time (%) = {} ({})\n", time,time/time_loop*100.0)); time = time_other; MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Other time (%) = {} ({})\n", time,time/time_loop*100.0)); } // HYPER stats if (hyperflag) { if (me == 0) utils::logmesg(lmp,"\nHyper stats:\n"); time = timer->get_wall(Timer::DYNAMICS); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Dynamics time (%) = {} ({})\n", time,time/time_loop*100.0)); time = timer->get_wall(Timer::QUENCH); MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Quench time (%) = {} ({})\n", time,time/time_loop*100.0)); time = time_other; MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) utils::logmesg(lmp,fmt::format(" Other time (%) = {} ({})\n", time,time/time_loop*100.0)); } // further timing breakdowns if (timeflag && timer->has_normal()) { if (me == 0) { if (timer->has_full()) utils::logmesg(lmp,"\nMPI task timing breakdown:\nSection | min time " " | avg time | max time |%varavg| %CPU | %total\n" "-----------------------------------------------------" "------------------\n"); else utils::logmesg(lmp,"\nMPI task timing breakdown:\nSection | min time " " | avg time | max time |%varavg| %total\n---------" "------------------------------------------------------\n"); } mpi_timings("Pair",timer,Timer::PAIR, world,nprocs, nthreads,me,time_loop,screen,logfile); if (atom->molecular != Atom::ATOMIC) mpi_timings("Bond",timer,Timer::BOND,world,nprocs, nthreads,me,time_loop,screen,logfile); if (force->kspace) mpi_timings("Kspace",timer,Timer::KSPACE,world,nprocs, nthreads,me,time_loop,screen,logfile); mpi_timings("Neigh",timer,Timer::NEIGH,world,nprocs, nthreads,me,time_loop,screen,logfile); mpi_timings("Comm",timer,Timer::COMM,world,nprocs, nthreads,me,time_loop,screen,logfile); mpi_timings("Output",timer,Timer::OUTPUT,world,nprocs, nthreads,me,time_loop,screen,logfile); mpi_timings("Modify",timer,Timer::MODIFY,world,nprocs, nthreads,me,time_loop,screen,logfile); if (timer->has_sync()) mpi_timings("Sync",timer,Timer::SYNC,world,nprocs, nthreads,me,time_loop,screen,logfile); time = time_other; MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; if (me == 0) { if (timer->has_full()) utils::logmesg(lmp,fmt::format("Other | | {:<10.4g} | " " | | |{:6.2f}\n", time,time/time_loop*100.0)); else utils::logmesg(lmp,fmt::format("Other | | {:<10.4g} | " " | |{:6.2f}\n", time,time/time_loop*100.0)); } } #ifdef LMP_USER_OMP int ifix = modify->find_fix("package_omp"); // print thread breakdown only with full timer detail if ((ifix >= 0) && timer->has_full() && me == 0) { double thr_total = 0.0; ThrData *td; FixOMP *fixomp = static_cast(lmp->modify->fix[ifix]); for (i=0; i < nthreads; ++i) { td = fixomp->get_thr(i); thr_total += td->get_time(Timer::ALL); } thr_total /= (double) nthreads; if (thr_total > 0.0) { const std::string thr_fmt = "\nThread timing breakdown (MPI rank {}):\nTotal threaded time {:.4g} / {:.1f}%\n" "Section | min time | avg time | max time |%varavg| %total\n" "---------------------------------------------------------------\n"; utils::logmesg(lmp,fmt::format(thr_fmt,me,thr_total,thr_total/time_loop*100.0)); omp_times(fixomp,"Pair",Timer::PAIR,nthreads,screen,logfile); if (atom->molecular != Atom::ATOMIC) omp_times(fixomp,"Bond",Timer::BOND,nthreads,screen,logfile); if (force->kspace) omp_times(fixomp,"Kspace",Timer::KSPACE,nthreads,screen,logfile); omp_times(fixomp,"Neigh",Timer::NEIGH,nthreads,screen,logfile); omp_times(fixomp,"Reduce",Timer::COMM,nthreads,screen,logfile); } } #endif if ((comm->me == 0) && lmp->kokkos && (lmp->kokkos->ngpus > 0)) if (const char* env_clb = getenv("CUDA_LAUNCH_BLOCKING")) if (!(strcmp(env_clb,"1") == 0)) { error->warning(FLERR,"Timing breakdown may not be accurate " "since GPU/CPU overlap is enabled\n" "Using 'export CUDA_LAUNCH_BLOCKING=1' will give an " "accurate timing breakdown but will reduce performance"); } // FFT timing statistics // time3d,time1d = total time during run for 3d and 1d FFTs // loop on timing() until nsample FFTs require at least 1.0 CPU sec // time_kspace may be 0.0 if another partition is doing Kspace if (fftflag) { if (me == 0) utils::logmesg(lmp,"\n"); int nsteps = update->nsteps; double time3d; int nsample = 1; int nfft = force->kspace->timing_3d(nsample,time3d); while (time3d < 1.0) { nsample *= 2; nfft = force->kspace->timing_3d(nsample,time3d); } time3d = nsteps * time3d / nsample; MPI_Allreduce(&time3d,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time3d = tmp/nprocs; double time1d; nsample = 1; nfft = force->kspace->timing_1d(nsample,time1d); while (time1d < 1.0) { nsample *= 2; nfft = force->kspace->timing_1d(nsample,time1d); } time1d = nsteps * time1d / nsample; MPI_Allreduce(&time1d,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time1d = tmp/nprocs; double time_kspace = timer->get_wall(Timer::KSPACE); MPI_Allreduce(&time_kspace,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time_kspace = tmp/nprocs; double ntotal = 1.0 * force->kspace->nx_pppm * force->kspace->ny_pppm * force->kspace->nz_pppm; double nflops = 5.0 * ntotal * log(ntotal); double fraction,flop3,flop1; if (nsteps) { if (time_kspace) fraction = time3d/time_kspace*100.0; else fraction = 0.0; flop3 = nfft*nflops/1.0e9/(time3d/nsteps); flop1 = nfft*nflops/1.0e9/(time1d/nsteps); } else fraction = flop3 = flop1 = 0.0; if (me == 0) utils::logmesg(lmp,fmt::format("FFT time (% of Kspce) = {:.6} ({:.4})\n" "FFT Gflps 3d (1d only) = {:.8} {:.8}\n", time3d,fraction,flop3,flop1)); } if (histoflag) { std::string mesg = "\n"; tmp = atom->nlocal; stats(1,&tmp,&ave,&max,&min,10,histo); if (me == 0) { mesg += fmt::format("Nlocal: {:11.6} ave {:11.6g} max {:11.6g} min\n",ave,max,min); mesg += "Histogram:"; for (i = 0; i < 10; i++) mesg += fmt::format(" {}",histo[i]); mesg += "\n"; } tmp = atom->nghost; stats(1,&tmp,&ave,&max,&min,10,histo); if (me == 0) { mesg += fmt::format("Nghost: {:11.6} ave {:11.6g} max {:11.6g} min\n",ave,max,min); mesg += "Histogram:"; for (i = 0; i < 10; i++) mesg += fmt::format(" {}",histo[i]); mesg += "\n"; } // find a non-skip neighbor list containing half pairwise interactions // count neighbors in that list for stats purposes // allow it to be Kokkos neigh list as well for (m = 0; m < neighbor->old_nrequest; m++) if (neighbor->old_requests[m]->half && neighbor->old_requests[m]->skip == 0 && neighbor->lists[m] && neighbor->lists[m]->numneigh) break; nneigh = 0; if (m < neighbor->old_nrequest) { if (!neighbor->lists[m]->kokkos) { int inum = neighbor->lists[m]->inum; int *ilist = neighbor->lists[m]->ilist; int *numneigh = neighbor->lists[m]->numneigh; for (i = 0; i < inum; i++) nneigh += numneigh[ilist[i]]; } else if (lmp->kokkos) nneigh = lmp->kokkos->neigh_count(m); } tmp = nneigh; stats(1,&tmp,&ave,&max,&min,10,histo); if (me == 0) { mesg += fmt::format("Neighs: {:11.6} ave {:11.6g} max {:11.6g} min\n",ave,max,min); mesg += "Histogram:"; for (i = 0; i < 10; i++) mesg += fmt::format(" {}",histo[i]); mesg += "\n"; } // find a non-skip neighbor list containing full pairwise interactions // count neighbors in that list for stats purposes // allow it to be Kokkos neigh list as well for (m = 0; m < neighbor->old_nrequest; m++) if (neighbor->old_requests[m]->full && neighbor->old_requests[m]->skip == 0) break; nneighfull = 0; if (m < neighbor->old_nrequest) { if (!neighbor->lists[m]->kokkos && neighbor->lists[m]->numneigh) { int inum = neighbor->lists[m]->inum; int *ilist = neighbor->lists[m]->ilist; int *numneigh = neighbor->lists[m]->numneigh; for (i = 0; i < inum; i++) nneighfull += numneigh[ilist[i]]; } else if (lmp->kokkos) nneighfull = lmp->kokkos->neigh_count(m); tmp = nneighfull; stats(1,&tmp,&ave,&max,&min,10,histo); if (me == 0) { mesg += fmt::format("FullNghs: {:11.6} ave {:11.6g} max {:11.6g} min\n",ave,max,min); mesg += "Histogram:"; for (i = 0; i < 10; i++) mesg += fmt::format(" {}",histo[i]); mesg += "\n"; } } if (me == 0) utils::logmesg(lmp,mesg); } if (neighflag) { if (me == 0) utils::logmesg(lmp,"\n"); tmp = MAX(nneigh,nneighfull); double nall; MPI_Allreduce(&tmp,&nall,1,MPI_DOUBLE,MPI_SUM,world); int nspec; double nspec_all = 0; if (atom->molecular == Atom::MOLECULAR) { int **nspecial = atom->nspecial; int nlocal = atom->nlocal; nspec = 0; for (i = 0; i < nlocal; i++) nspec += nspecial[i][2]; tmp = nspec; MPI_Allreduce(&tmp,&nspec_all,1,MPI_DOUBLE,MPI_SUM,world); } else if (atom->molecular == Atom::TEMPLATE) { Molecule **onemols = atom->avec->onemols; int *molindex = atom->molindex; int *molatom = atom->molatom; int nlocal = atom->nlocal; int imol,iatom; nspec = 0; for (i = 0; i < nlocal; i++) { if (molindex[i] < 0) continue; imol = molindex[i]; iatom = molatom[i]; nspec += onemols[imol]->nspecial[iatom][2]; } tmp = nspec; MPI_Allreduce(&tmp,&nspec_all,1,MPI_DOUBLE,MPI_SUM,world); } if (me == 0) { std::string mesg; mesg += fmt::format("Total # of neighbors = {:.8g}\n",nall); if (atom->natoms > 0) mesg += fmt::format("Ave neighs/atom = {:.8}\n",nall/atom->natoms); if ((atom->molecular != Atom::ATOMIC) && (atom->natoms > 0)) mesg += fmt::format("Ave special neighs/atom = {:.8}\n", nspec_all/atom->natoms); mesg += fmt::format("Neighbor list builds = {}\n",neighbor->ncalls); if (neighbor->dist_check) mesg += fmt::format("Dangerous builds = {}\n",neighbor->ndanger); else mesg += "Dangerous builds not checked\n"; utils::logmesg(lmp,mesg); } } if (logfile) fflush(logfile); } /* ---------------------------------------------------------------------- */ void Finish::stats(int n, double *data, double *pave, double *pmax, double *pmin, int nhisto, int *histo) { int i,m; int *histotmp; double min = 1.0e20; double max = -1.0e20; double ave = 0.0; for (i = 0; i < n; i++) { ave += data[i]; if (data[i] < min) min = data[i]; if (data[i] > max) max = data[i]; } int ntotal; MPI_Allreduce(&n,&ntotal,1,MPI_INT,MPI_SUM,world); double tmp; MPI_Allreduce(&ave,&tmp,1,MPI_DOUBLE,MPI_SUM,world); ave = tmp/ntotal; MPI_Allreduce(&min,&tmp,1,MPI_DOUBLE,MPI_MIN,world); min = tmp; MPI_Allreduce(&max,&tmp,1,MPI_DOUBLE,MPI_MAX,world); max = tmp; for (i = 0; i < nhisto; i++) histo[i] = 0; double del = max - min; for (i = 0; i < n; i++) { if (del == 0.0) m = 0; else m = static_cast ((data[i]-min)/del * nhisto); if (m > nhisto-1) m = nhisto-1; histo[m]++; } memory->create(histotmp,nhisto,"finish:histotmp"); MPI_Allreduce(histo,histotmp,nhisto,MPI_INT,MPI_SUM,world); for (i = 0; i < nhisto; i++) histo[i] = histotmp[i]; memory->destroy(histotmp); *pave = ave; *pmax = max; *pmin = min; } /* ---------------------------------------------------------------------- */ void mpi_timings(const char *label, Timer *t, enum Timer::ttype tt, MPI_Comm world, const int nprocs, const int nthreads, const int me, double time_loop, FILE *scr, FILE *log) { double tmp, time_max, time_min, time_sq; double time = t->get_wall(tt); double time_cpu = t->get_cpu(tt); if (time/time_loop < 0.001) // insufficient timer resolution! time_cpu = 1.0; else time_cpu = time_cpu / time; if (time_cpu > nthreads) time_cpu = nthreads; MPI_Allreduce(&time,&time_min,1,MPI_DOUBLE,MPI_MIN,world); MPI_Allreduce(&time,&time_max,1,MPI_DOUBLE,MPI_MAX,world); time_sq = time*time; MPI_Allreduce(&time,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time = tmp/nprocs; MPI_Allreduce(&time_sq,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time_sq = tmp/nprocs; MPI_Allreduce(&time_cpu,&tmp,1,MPI_DOUBLE,MPI_SUM,world); time_cpu = tmp/nprocs*100.0; // % variance from the average as measure of load imbalance if ((time > 0.001) && ((time_sq/time - time) > 1.0e-10)) time_sq = sqrt(time_sq/time - time)*100.0; else time_sq = 0.0; if (me == 0) { tmp = time/time_loop*100.0; std::string mesg; if (t->has_full()) mesg = fmt::format("{:<8s}| {:<10.5g} | {:<10.5g} | {:<10.5g} |{:6.1f} |" "{:6.1f} |{:6.2f}\n", label,time_min,time,time_max,time_sq,time_cpu,tmp); else mesg = fmt::format("{:<8s}| {:<10.5g} | {:<10.5g} | {:<10.5g} |{:6.1f} |" "{:6.2f}\n",label,time_min,time,time_max,time_sq,tmp); if (scr) fputs(mesg.c_str(),scr); if (log) fputs(mesg.c_str(),log); } } /* ---------------------------------------------------------------------- */ #ifdef LMP_USER_OMP void omp_times(FixOMP *fix, const char *label, enum Timer::ttype which, const int nthreads,FILE *scr, FILE *log) { double time_min, time_max, time_avg, time_total, time_std; time_min = 1.0e100; time_max = -1.0e100; time_total = time_avg = time_std = 0.0; for (int i=0; i < nthreads; ++i) { ThrData *thr = fix->get_thr(i); double tmp=thr->get_time(which); time_min = MIN(time_min,tmp); time_max = MAX(time_max,tmp); time_avg += tmp; time_std += tmp*tmp; time_total += thr->get_time(Timer::ALL); } time_avg /= nthreads; time_std /= nthreads; time_total /= nthreads; if ((time_avg > 0.001) && ((time_std/time_avg -time_avg) > 1.0e-10)) time_std = sqrt(time_std/time_avg - time_avg)*100.0; else time_std = 0.0; std::string mesg = fmt::format("{:<8s}| {:10.5g} | {:10.5g} | {:10.5g} |" "{:6.1f} |{:6.2f}\n",label,time_min,time_avg, time_max,time_std,time_avg/time_total*100.0); if (scr) fputs(mesg.c_str(),scr); if (log) fputs(mesg.c_str(),log); } #endif