diff --git a/doc/timer.txt b/doc/timer.txt index d465e450f6..42b3d1e73c 100644 --- a/doc/timer.txt +++ b/doc/timer.txt @@ -12,17 +12,20 @@ timer command :h3 timer args :pre -{args} = one or more of {off} or {loop} or {normal} or {full} or {sync} or {nosync} :l +{args} = one or more of {off} or {loop} or {normal} or {full} or {sync} or {nosync} or {timeout} or {every} :l {off} = do not collect or print any timing information {loop} = collect only the total time for the simulation loop {normal} = collect timer information broken down by sections (default) {full} = like {normal} but also include CPU and thread utilzation {sync} = explicitly synchronize MPI tasks between sections - {nosync} = do not synchronize MPI tasks between sections (default) :pre + {nosync} = do not synchronize MPI tasks between sections (default) + {timeout} value = set walltime limit to {value} + {every} value = perform timeout check every {value} steps :pre [Examples:] timer full sync +timer timeout 2:00:00 every 100 timer loop :pre [Description:] @@ -50,6 +53,20 @@ call which meaures load imbalance more accuractly, though it can also slow down the simulation. Using the {nosync} setting (which is the default) turns off this synchronization. +With the {timeout} keyword a walltime limit can be imposed that affects +"run"_run.html and "minimize"_minimize.html commands. If the time +limit is reached, ongoing calculations will be stopped on the next +step that is a multiple of the value specified with {every}. All +following run or minimize commands will be skipped until the timeout +is reset or turned off. The timeout value can be "off" or "unlimited" +to turn the timeout off; otherwise a single number is interpreted as +seconds, two numbers separated by a colon (MM:SS) as minutes and +seconds and three numbers separated by colons as hours, minutes and +seconds, respectively. The {every} keyword sets how frequently the +wall clock will be checked. 
Checking for timeout very often, can +slow a calculation down, checking too infrequent makes the timeout +measurement less accurate. + Multiple keywords can be specified. For keywords that are mutually exclusive, the last one specified takes effect. @@ -71,3 +88,5 @@ can just use the {loop} or {off} setting. [Default:] timer normal nosync +timer timeout off +timer every 10 :pre diff --git a/src/USER-OMP/respa_omp.cpp b/src/USER-OMP/respa_omp.cpp index f5603adf92..c81184b6a9 100644 --- a/src/USER-OMP/respa_omp.cpp +++ b/src/USER-OMP/respa_omp.cpp @@ -69,22 +69,13 @@ void RespaOMP::init() void RespaOMP::setup() { + timer->init_timeout(); if (comm->me == 0 && screen) { fprintf(screen,"Setting up r-RESPA/omp run ...\n"); fprintf(screen," Unit style : %s\n", update->unit_style); fprintf(screen," Current step : " BIGINT_FORMAT "\n", update->ntimestep); fprintf(screen," OuterTime step: %g\n", update->dt); - if (update->max_wall > 0) { - char outtime[128]; - double totalclock = update->max_wall; - int seconds = fmod(totalclock,60.0); - totalclock = (totalclock - seconds) / 60.0; - int minutes = fmod(totalclock,60.0); - int hours = (totalclock - minutes) / 60.0; - sprintf(outtime," Max walltime: " - "%d:%02d:%02d\n", hours, minutes, seconds); - fputs(outtime,screen); - } + timer->print_timeout(screen); } update->setupflag = 1; diff --git a/src/respa.cpp b/src/respa.cpp index 0baf713a3b..67db8b2913 100644 --- a/src/respa.cpp +++ b/src/respa.cpp @@ -390,22 +390,13 @@ void Respa::init() void Respa::setup() { + timer->init_timeout(); if (comm->me == 0 && screen) { fprintf(screen,"Setting up r-RESPA run ...\n"); fprintf(screen," Unit style : %s\n", update->unit_style); fprintf(screen," Current step : " BIGINT_FORMAT "\n", update->ntimestep); fprintf(screen," OuterTime step: %g\n", update->dt); - if (update->max_wall > 0) { - char outtime[128]; - double totalclock = update->max_wall; - int seconds = fmod(totalclock,60.0); - totalclock = (totalclock - seconds) / 60.0; - int 
minutes = fmod(totalclock,60.0); - int hours = (totalclock - minutes) / 60.0; - sprintf(outtime," Max walltime: " - "%d:%02d:%02d\n", hours, minutes, seconds); - fputs(outtime,screen); - } + timer->print_timeout(screen); } update->setupflag = 1; @@ -559,7 +550,7 @@ void Respa::run(int n) bigint ntimestep; for (int i = 0; i < n; i++) { - if (update->time_expired()) { + if (timer->check_timeout(i)) { update->nsteps = i; return; } diff --git a/src/run.cpp b/src/run.cpp index 450cad638a..5381bf2db7 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -42,6 +42,9 @@ void Run::command(int narg, char **arg) if (domain->box_exist == 0) error->all(FLERR,"Run command before simulation box is defined"); + // ignore run command, if walltime limit was already reached + if (timer->is_timeout()) return; + bigint nsteps_input = force->bnumeric(FLERR,arg[0]); // parse optional args @@ -55,7 +58,6 @@ void Run::command(int narg, char **arg) int nevery = 0; int ncommands = 0; int first,last; - double timelimit = -1.0; int iarg = 1; while (iarg < narg) { @@ -73,14 +75,6 @@ void Run::command(int narg, char **arg) stopflag = 1; stop = force->bnumeric(FLERR,arg[iarg+1]); iarg += 2; - } else if (strcmp(arg[iarg],"max_hours") == 0) { - if (iarg+2 > narg) error->all(FLERR,"Illegal run command"); - if (strcmp(arg[iarg+1],"unlimited") == 0) timelimit = -1.0; - else { - timelimit = 3600.0*force->numeric(FLERR,arg[iarg+1]); - if (timelimit <= 0.0) error->all(FLERR,"Illegal run command"); - } - iarg += 2; } else if (strcmp(arg[iarg],"pre") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal run command"); if (strcmp(arg[iarg+1],"no") == 0) preflag = 0; @@ -161,7 +155,6 @@ void Run::command(int narg, char **arg) // if post, do full Finish, else just print time update->whichflag = 1; - update->max_wall = timelimit; if (nevery == 0) { update->nsteps = nsteps; @@ -200,6 +193,8 @@ void Run::command(int narg, char **arg) int iter = 0; int nleft = nsteps; while (nleft > 0 || iter == 0) { + if 
// convert a timespec ([[HH:]MM:]SS) to seconds
//
// timespec : NUL-terminated argument of the "timeout" keyword; it is only
//            read, never modified (the previous strtok() based version
//            overwrote the ':' separators in the caller's string).
// returns  : number of seconds for one (SS), two (MM:SS) or three
//            (HH:MM:SS) colon separated unsigned decimal fields, or
//            -1 for "off", "unlimited", a NULL pointer and any malformed
//            input (non-digit characters, no field at all, more than
//            three fields).
//
// Malformed input deliberately maps to -1 (no limit) and not to 0: the
// caller stores the result as the timeout and a timeout of exactly 0 is
// the "wall time expired" marker, so a typo like "2h" or "abc" must not
// silently cause all following runs to be skipped.

static double timespec2seconds(char *timespec)
{
  // first handle a missing argument and the allowed textual inputs
  if (timespec == NULL) return -1;
  if (strcmp(timespec,"off") == 0) return -1;
  if (strcmp(timespec,"unlimited") == 0) return -1;

  double vals[3] = {0.0, 0.0, 0.0};
  int nfields = 0;
  const char *ptr = timespec;

  while (true) {
    // a run of ':' counts as one separator, same as strtok() did
    while (*ptr == ':') ++ptr;
    if (*ptr == '\0') break;

    // a fourth field cannot be part of HH:MM:SS
    if (nfields == 3) return -1;

    // accumulate one unsigned decimal field
    const char *start = ptr;
    double value = 0.0;
    while ((*ptr >= '0') && (*ptr <= '9')) {
      value = 10.0*value + (*ptr - '0');
      ++ptr;
    }

    // field must contain at least one digit and must be followed by
    // either a separator or the end of the string
    if ((ptr == start) || ((*ptr != ':') && (*ptr != '\0'))) return -1;

    vals[nfields++] = value;
  }

  if (nfields == 3) return (vals[0]*60 + vals[1])*60 + vals[2];
  else if (nfields == 2) return vals[0]*60 + vals[1];
  else if (nfields == 1) return vals[0];
  else return -1;
}
@@ -64,6 +95,9 @@ Timer::Timer(LAMMPS *lmp) : Pointers(lmp) { _level = NORMAL; _sync = OFF; + _timeout = -1.0; + _checkfreq = 10; + _nextcheck = -1; } /* ---------------------------------------------------------------------- */ @@ -176,12 +210,57 @@ void Timer::set_wall(enum ttype which, double newtime) wall_array[which] = newtime; } +/* ---------------------------------------------------------------------- */ + +void Timer::init_timeout() +{ + if (_timeout < 0) + _nextcheck = -1; + else + _nextcheck = _checkfreq; +} + +/* ---------------------------------------------------------------------- */ + +void Timer::print_timeout(FILE *fp) +{ + if (!fp) return; + + // format timeout setting + if (_timeout > 0) { + char timebuf[32]; + time_t tv = _timeout - (MPI_Wtime() - timeout_start); + struct tm *tm = gmtime(&tv); + strftime(timebuf,32,"%H:%M:%S",tm); + fprintf(fp," Walltime left: %s\n",timebuf); + } +} + +/* ---------------------------------------------------------------------- */ + +bool Timer::_check_timeout() +{ + double walltime = MPI_Wtime() - timeout_start; + // broadcast time to insure all ranks act the same. 
+ MPI_Bcast(&walltime,1,MPI_DOUBLE,0,world); + + if (walltime < _timeout) { + _nextcheck += _checkfreq; + return false; + } else { + if (comm->me == 0) + error->warning(FLERR,"Wall time limit reached"); + _timeout = 0.0; + return true; + } +} + /* ---------------------------------------------------------------------- modify parameters of the Timer class ------------------------------------------------------------------------- */ static const char *timer_style[] = { "off", "loop", "normal", "full" }; static const char *timer_mode[] = { "nosync", "(dummy)", "sync" }; -static const char timer_fmt[] = "New timer settings: style=%s mode=%s\n"; +static const char timer_fmt[] = "New timer settings: style=%s mode=%s timeout=%s\n"; void Timer::modify_params(int narg, char **arg) { @@ -199,14 +278,37 @@ void Timer::modify_params(int narg, char **arg) _sync = OFF; } else if (strcmp(arg[iarg],timer_mode[NORMAL]) == 0) { _sync = NORMAL; + } else if (strcmp(arg[iarg],"timeout") == 0) { + ++iarg; + if (iarg < narg) { + _timeout = timespec2seconds(arg[iarg]); + } else error->all(FLERR,"Illegal timers command"); + } else if (strcmp(arg[iarg],"every") == 0) { + ++iarg; + if (iarg < narg) { + _checkfreq = force->inumeric(FLERR,arg[iarg]); + if (_checkfreq <= 0) + error->all(FLERR,"Illegal timers command"); + } else error->all(FLERR,"Illegal timers command"); } else error->all(FLERR,"Illegal timers command"); ++iarg; } + timeout_start = MPI_Wtime(); if (comm->me == 0) { + + // format timeout setting + char timebuf[32]; + if (_timeout < 0) strcpy(timebuf,"off"); + else { + time_t tv = _timeout; + struct tm *tm = gmtime(&tv); + strftime(timebuf,32,"%H:%M:%S",tm); + } + if (screen) - fprintf(screen,timer_fmt,timer_style[_level],timer_mode[_sync]); + fprintf(screen,timer_fmt,timer_style[_level],timer_mode[_sync],timebuf); if (logfile) - fprintf(logfile,timer_fmt,timer_style[_level],timer_mode[_sync]); + fprintf(logfile,timer_fmt,timer_style[_level],timer_mode[_sync],timebuf); } } diff 
--git a/src/timer.h b/src/timer.h index adf554f468..3e2c88868f 100644 --- a/src/timer.h +++ b/src/timer.h @@ -36,7 +36,7 @@ class Timer : protected Pointers { void stamp(enum ttype which=START) { if (_level > LOOP) _stamp(which); } - + void barrier_start(); void barrier_stop(); @@ -47,6 +47,9 @@ class Timer : protected Pointers { bool has_full() const { return (_level >= FULL); } bool has_sync() const { return (_sync != OFF); } + // flag if wallclock time is expired + bool is_timeout() const { return (_timeout == 0.0); } + double elapsed(enum ttype); double cpu(enum ttype); @@ -57,6 +60,21 @@ class Timer : protected Pointers { void set_wall(enum ttype, double); + // initialize timeout timer + void init_timeout(); + + // trigger enforced timeout + void force_timeout() { _timeout = 0.0; }; + + // print timeout message + void print_timeout(FILE *); + + // check for timeout. inline wrapper around internal + // function to reduce overhead in case there is no check. + bool check_timeout(int step) { + if (_nextcheck != step) return false; + else return _check_timeout(); + } void modify_params(int, char **); @@ -65,11 +83,18 @@ class Timer : protected Pointers { double wall_array[NUM_TIMER]; double previous_cpu; double previous_wall; - int _level; // level of detail: off=0,loop=1,normal=2,full=3 - int _sync; // if nonzero, synchronize tasks before setting the timer + double timeout_start; + int _level; // level of detail: off=0,loop=1,normal=2,full=3 + int _sync; // if nonzero, synchronize tasks before setting the timer + int _timeout; // max allowed wall time in seconds. 
infinity if negative + int _checkfreq; // frequency of timeout checking + int _nextcheck; // timestep number of next timeout check - // update requested timer array + // update one specific timer array void _stamp(enum ttype); + + // check for timeout + bool _check_timeout(); }; } diff --git a/src/update.cpp b/src/update.cpp index 879796edf9..78f4fe73a3 100644 --- a/src/update.cpp +++ b/src/update.cpp @@ -42,7 +42,6 @@ Update::Update(LAMMPS *lmp) : Pointers(lmp) atime = 0.0; atimestep = 0; first_update = 0; - max_wall = -1.0; whichflag = 0; firststep = laststep = 0; @@ -96,9 +95,6 @@ void Update::init() if (strstr(minimize_style,"cuda") == NULL) error->all(FLERR,"USER-CUDA mode requires CUDA variant of min style"); - if (max_wall > 0.0 && !timer->has_loop()) - error->warning(FLERR,"Wall time limit ignored with 'timer off'"); - // init the appropriate integrate and/or minimize class // if neither (e.g. from write_restart) then just return @@ -485,25 +481,6 @@ void Update::update_time() atimestep = ntimestep; } -/* ---------------------------------------------------------------------- - return 1, if walltime limit has expired. return 0 otherwise. - called at the beginning runs -------------------------------------------------------------------------- */ - -int Update::time_expired() -{ - if (max_wall < 0) return 0; - - int flag = 0; - double totaltime = MPI_Wtime() - timer->get_wall(Timer::TOTAL); - if (timer->has_loop() && (totaltime > max_wall)) - flag = 1; - - // we have to make certain, that flag is set consistently. 
- MPI_Bcast(&flag,1,MPI_INT,0,world); - return flag; -} - /* ---------------------------------------------------------------------- memory usage of update and integrate/minimize ------------------------------------------------------------------------- */ diff --git a/src/update.h b/src/update.h index f821df105e..38bf1b7db9 100644 --- a/src/update.h +++ b/src/update.h @@ -25,7 +25,6 @@ class Update : protected Pointers { bigint ntimestep; // current step (dynamics or min iterations) int nsteps; // # of steps to run (dynamics or min iter) int whichflag; // 0 for unset, 1 for dynamics, 2 for min - double max_wall; // walltime limit in seconds; < 0 for unlimited double atime; // simulation time at atime_step bigint atimestep; // last timestep atime was updated bigint firststep,laststep; // 1st & last step of this run @@ -56,7 +55,6 @@ class Update : protected Pointers { void reset_timestep(int, char **); void reset_timestep(bigint); void update_time(); - int time_expired(); bigint memory_usage(); private: diff --git a/src/verlet.cpp b/src/verlet.cpp index 00e2dedc8c..c184973681 100644 --- a/src/verlet.cpp +++ b/src/verlet.cpp @@ -92,17 +92,7 @@ void Verlet::setup() fprintf(screen," Unit style : %s\n", update->unit_style); fprintf(screen," Current step: " BIGINT_FORMAT "\n", update->ntimestep); fprintf(screen," Time step : %g\n", update->dt); - if (update->max_wall > 0) { - char outtime[128]; - double totalclock = update->max_wall; - int seconds = fmod(totalclock,60.0); - totalclock = (totalclock - seconds) / 60.0; - int minutes = fmod(totalclock,60.0); - int hours = (totalclock - minutes) / 60.0; - sprintf(outtime," Max walltime: " - "%d:%02d:%02d\n", hours, minutes, seconds); - fputs(outtime,screen); - } + timer->print_timeout(screen); } update->setupflag = 1; @@ -238,8 +228,9 @@ void Verlet::run(int n) if (atom->sortfreq > 0) sortflag = 1; else sortflag = 0; + timer->init_timeout(); for (int i = 0; i < n; i++) { - if (update->time_expired()) { + if 
(timer->check_timeout(i)) { update->nsteps = i; return; }