add option to synchronize fix halt stop requests across multi-partition runs

This commit is contained in:
Axel Kohlmeyer
2025-03-07 11:40:27 -05:00
parent 2ff078f973
commit 6e94d83cff
3 changed files with 93 additions and 13 deletions

View File

@ -25,13 +25,14 @@ Syntax
* operator = "<" or "<=" or ">" or ">=" or "==" or "!=" or "\|\^"
* avalue = numeric value to compare attribute to
* zero or more keyword/value pairs may be appended
* keyword = *error* or *message* or *path*
* keyword = *error* or *message* or *path* or *universe*
.. parsed-literal::
*error* value = *hard* or *soft* or *continue*
*message* value = *yes* or *no*
*path* value = path to check for free space (may be in quotes)
*universe* value = *yes* or *no*
Examples
@ -40,8 +41,10 @@ Examples
.. code-block:: LAMMPS
fix 10 all halt 1 bondmax > 1.5
fix 10 all halt 10 v_myCheck != 0 error soft
fix 10 all halt 10 v_myCheck != 0 error soft message no
fix 10 all halt 100 diskfree < 100000.0 path "dump storage/."
fix 2 all halt 100 v_curtime > ${maxtime} universe yes
Description
"""""""""""
@ -162,12 +165,21 @@ is printed; the run simply exits. The latter may be desirable for
post-processing tools that extract thermodynamic information from log
files.
The optional *universe* keyword determines whether the halt request
should be synchronized across the partitions of a :doc:`multi-partition
run <Run_options>`. If *universe* is set to *yes*, fix halt will check
whether a stop request has been received from any of the other
partitions and, if so, stop the run on this partition as well.
Conversely, if fix halt on this partition determines that the simulation
should be halted, it will send a stop request to all other partitions so
that they stop their runs, too.
Restart, fix_modify, output, run start/stop, minimize info
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
No information about this fix is written to :doc:`binary restart files <restart>`. None of the :doc:`fix_modify <fix_modify>` options
are relevant to this fix. No global or per-atom quantities are stored
by this fix for access by various :doc:`output commands <Howto_output>`.
No information about this fix is written to :doc:`binary restart files
<restart>`. None of the :doc:`fix_modify <fix_modify>` options are
relevant to this fix. No global or per-atom quantities are stored by
this fix for access by various :doc:`output commands <Howto_output>`.
No parameter of this fix can be used with the *start/stop* keywords of
the :doc:`run <run>` command.
@ -183,4 +195,4 @@ Related commands
Default
"""""""
The option defaults are error = soft, message = yes, and path = ".".
The option defaults are error = soft, message = yes, path = ".", and universe = no.

View File

@ -22,6 +22,7 @@
#include "neighbor.h"
#include "timer.h"
#include "update.h"
#include "universe.h"
#include "variable.h"
#include <cmath>
@ -34,6 +35,7 @@ enum { BONDMAX, TLIMIT, DISKFREE, VARIABLE };
enum { LT, LE, GT, GE, EQ, NEQ, XOR };
enum { HARD, SOFT, CONTINUE };
enum { NOMSG = 0, YESMSG = 1 };
static constexpr int UTAG = 999;
/* ---------------------------------------------------------------------- */
@ -42,11 +44,10 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) :
{
if (narg < 7) utils::missing_cmd_args(FLERR, "fix halt", error);
nevery = utils::inumeric(FLERR, arg[3], false, lmp);
if (nevery <= 0) error->all(FLERR, "Illegal fix halt command: nevery must be > 0");
if (nevery <= 0) error->all(FLERR, 3, "Illegal fix halt command: nevery must be > 0");
// comparison args
idvar = nullptr;
int iarg = 4;
if (strcmp(arg[iarg], "tlimit") == 0) {
@ -56,20 +57,22 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) :
dlimit_path = utils::strdup(".");
} else if (strcmp(arg[iarg], "bondmax") == 0) {
attribute = BONDMAX;
} else {
} else if (utils::strmatch(arg[iarg], "^v_")) {
ArgInfo argi(arg[iarg], ArgInfo::VARIABLE);
if ((argi.get_type() == ArgInfo::UNKNOWN) || (argi.get_type() == ArgInfo::NONE) ||
(argi.get_dim() != 0))
error->all(FLERR, "Invalid fix halt attribute {}", arg[iarg]);
error->all(FLERR, iarg, "Invalid fix halt attribute {}", arg[iarg]);
attribute = VARIABLE;
idvar = argi.copy_name();
ivar = input->variable->find(idvar);
if (ivar < 0) error->all(FLERR, "Could not find fix halt variable name");
if (ivar < 0) error->all(FLERR, iarg, "Could not find fix halt variable name {}", idvar);
if (input->variable->equalstyle(ivar) == 0)
error->all(FLERR, "Fix halt variable is not equal-style variable");
error->all(FLERR, iarg, "Fix halt variable is not equal-style variable");
} else {
error->all(FLERR, iarg, "Unknown fix halt keyword {}", arg[iarg]);
}
// clang-format off
@ -90,6 +93,7 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) :
eflag = SOFT;
msgflag = YESMSG;
uflag = NOMSG;
++iarg;
while (iarg < narg) {
if (strcmp(arg[iarg], "error") == 0) {
@ -103,6 +107,10 @@ FixHalt::FixHalt(LAMMPS *lmp, int narg, char **arg) :
if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix halt message", error);
msgflag = utils::logical(FLERR, arg[iarg + 1], false, lmp);
iarg += 2;
} else if (strcmp(arg[iarg], "universe") == 0) {
if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix halt universe", error);
uflag = utils::logical(FLERR, arg[iarg + 1], false, lmp);
iarg += 2;
} else if (strcmp(arg[iarg], "path") == 0) {
if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix halt error", error);
++iarg;
@ -189,6 +197,50 @@ void FixHalt::min_post_force(int /* vflag */)
void FixHalt::end_of_step()
{
// check if another partition has exited and we need to exit, too.
if (uflag) {
MPI_Status status;
int partition = -1;
int flag = 0;
if (comm->me == 0) {
// probe if any stop request from another partition is pending
MPI_Iprobe(MPI_ANY_SOURCE, UTAG, universe->uworld, &flag, &status);
if (flag) {
// determine which partition sent the stop request and receive the message
for (int i = 0; i < universe->nworlds; ++i)
if (universe->root_proc[i] == status.MPI_SOURCE) partition = i + 1;
MPI_Recv(&flag, 1, MPI_INT, status.MPI_SOURCE, UTAG, universe->uworld, MPI_STATUS_IGNORE);
}
}
// broadcast stop request partition to all processes in our partition
MPI_Bcast(&partition, 1, MPI_INT, 0, world);
// a stop request from another partition is pending: handle it the same way as a local halt below
if (partition > 0) {
// hard halt -> exit LAMMPS
// soft/continue halt -> trigger timer to break from run loop
// print message with ID of fix halt in case multiple instances
auto message = fmt::format("Received universe halt request from partition {} for fix-id {} on step {}",
partition, id, update->ntimestep);
if (eflag == HARD) {
error->all(FLERR, message);
} else if ((eflag == SOFT) || (eflag == CONTINUE)) {
if ((comm->me == 0) && (msgflag == YESMSG)) error->message(FLERR, message);
timer->force_timeout();
}
}
}
// variable evaluation may invoke computes so wrap with clear/add
double attvalue;
@ -228,6 +280,22 @@ void FixHalt::end_of_step()
if ((attvalue == 0.0 && value == 0.0) || (attvalue != 0.0 && value != 0.0)) return;
}
// send message to all other root processes to trigger exit across universe, if requested
if (uflag && (comm->me == 0)) {
MPI_Request *req = new MPI_Request[universe->nworlds];
for (int i = 0; i < universe->nworlds; ++i) {
if (universe->me == universe->root_proc[i]) continue;
MPI_Isend(&eflag, 1, MPI_INT, universe->root_proc[i], UTAG, universe->uworld, req + i);
}
// wait for all sends to complete, so MPI_Finalize() will be happy
for (int i = 0; i < universe->nworlds; ++i) {
if (universe->me == universe->root_proc[i]) continue;
MPI_Wait(req + i, MPI_STATUS_IGNORE);
}
}
// hard halt -> exit LAMMPS
// soft/continue halt -> trigger timer to break from run loop
// print message with ID of fix halt in case multiple instances

View File

@ -35,7 +35,7 @@ class FixHalt : public Fix {
void post_run() override;
private:
int attribute, operation, eflag, msgflag, ivar;
int attribute, operation, eflag, msgflag, ivar, uflag;
bigint nextstep, thisstep;
double value, tratio;
char *idvar;