ENH: allow disabling of initial MPI_Comm_dup(MPI_COMM_WORLD,...)

- can use -mpi-no-comm-dup to suppress the initial communicator
  duplication (to avoid potential deadlock with coupled processes).
  This is partly related to comments in merge-request !735

ENH: simplify parsing/removal of local -world option

- can extract the world name in a single pass and also makes the
  parsing more robust.

ENH: support regular MPI_Comm_split()

- the two-step method of Allgather + Comm_create_group may not be
  expected by other applications (issue #3127) and that can lead to
  deadlock, so also add in code for the regular MPI_Comm_split.
  Does not support re-sorting keys!

FIX: faulty logic for splitting communicators

- only affected more recent develop branch
This commit is contained in:
Mark Olesen
2025-04-30 09:59:59 +02:00
parent c5ceec3c73
commit 34143b433a
9 changed files with 222 additions and 68 deletions

View File

@ -1,3 +1,3 @@
Test-parallel-comm2.C Test-parallel-comm2.cxx
EXE = $(FOAM_USER_APPBIN)/Test-parallel-comm2 EXE = $(FOAM_USER_APPBIN)/Test-parallel-comm2

View File

@ -69,6 +69,7 @@ int main(int argc, char *argv[])
argList::addBoolOption("info", "information"); argList::addBoolOption("info", "information");
argList::addBoolOption("print-tree", "Report tree(s) as graph"); argList::addBoolOption("print-tree", "Report tree(s) as graph");
argList::addBoolOption("no-test", "Disable general tests"); argList::addBoolOption("no-test", "Disable general tests");
argList::addBoolOption("split", "Test Pstream split-comm");
argList::addBoolOption("host-comm", "Test Pstream host-comm"); argList::addBoolOption("host-comm", "Test Pstream host-comm");
argList::addBoolOption("host-broadcast", "Test host-base broadcasts"); argList::addBoolOption("host-broadcast", "Test host-base broadcasts");
@ -85,8 +86,8 @@ int main(int argc, char *argv[])
if (UPstream::parRun() && optPrintTree) if (UPstream::parRun() && optPrintTree)
{ {
Info<< "comms: " // Info<< "comms: "
<< UPstream::whichCommunication(UPstream::worldComm) << nl; // << UPstream::whichCommunication(UPstream::worldComm) << nl;
UPstream::printCommTree(UPstream::commWorld()); UPstream::printCommTree(UPstream::commWorld());
} }
@ -102,6 +103,34 @@ int main(int argc, char *argv[])
<< flatOutput(UPstream::procID(UPstream::commLocalNode())) << nl; << flatOutput(UPstream::procID(UPstream::commLocalNode())) << nl;
} }
if (UPstream::parRun() && args.found("split"))
{
Info<< "split: alternative ranks" << nl;
const auto myRank = UPstream::myProcNo();
int colour =
(
(myRank == 5 || myRank == 6) // Exclude these ones
? -1
: (myRank % 2)
);
UPstream::communicator comm =
UPstream::communicator::split(UPstream::commWorld(), colour, true);
Pout<< "split ranks (colour=" << colour << ") "
<< flatOutput(UPstream::procID(comm.comm())) << nl;
comm.reset();
comm =
UPstream::communicator::split(UPstream::commWorld(), colour, false);
Pout<< "Split ranks (colour=" << colour << ") "
<< flatOutput(UPstream::procID(comm.comm())) << nl;
}
if (args.found("info")) if (args.found("info"))
{ {
Info<< nl; Info<< nl;
@ -135,8 +164,8 @@ int main(int argc, char *argv[])
<< endl; << endl;
{ {
Info<< "host-master: " // Info<< "host-master: "
<< UPstream::whichCommunication(commInterNode) << endl; // << UPstream::whichCommunication(commInterNode) << endl;
UPstream::printCommTree(commInterNode); UPstream::printCommTree(commInterNode);
UPstream::printCommTree(commLocalNode); UPstream::printCommTree(commLocalNode);

View File

@ -6,7 +6,7 @@
# \\ / A nd | www.openfoam.com # \\ / A nd | www.openfoam.com
# \\/ M anipulation | # \\/ M anipulation |
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Copyright (C) 2017-2023 OpenCFD Ltd. # Copyright (C) 2017-2025 OpenCFD Ltd.
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# License # License
# This file is part of OpenFOAM, distributed under GPL-3.0-or-later. # This file is part of OpenFOAM, distributed under GPL-3.0-or-later.
@ -168,6 +168,7 @@ extractOptions()
-e '/^-doc-source/d; /^-help-man/d;' \ -e '/^-doc-source/d; /^-help-man/d;' \
-e '/^-hostRoots /d; /^-roots /d;' \ -e '/^-hostRoots /d; /^-roots /d;' \
-e '/^-lib /d; /^-no-libs /d;' \ -e '/^-lib /d; /^-no-libs /d;' \
-e '/^-mpi-.*/d;' \
-e '/^-[a-z]*-switch /d;' \ -e '/^-[a-z]*-switch /d;' \
-e 'y/,/ /; s/=.*$/=/;' \ -e 'y/,/ /; s/=.*$/=/;' \
-e '/^-[^ ]* </{ s/^\(-[^ ]* <\).*$/\1/; p; d }' \ -e '/^-[^ ]* </{ s/^\(-[^ ]* <\).*$/\1/; p; d }' \

View File

@ -6,7 +6,7 @@
# \\ / A nd | www.openfoam.com # \\ / A nd | www.openfoam.com
# \\/ M anipulation | # \\/ M anipulation |
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# Copyright (C) 2020-2023 OpenCFD Ltd. # Copyright (C) 2020-2025 OpenCFD Ltd.
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# License # License
# This file is part of OpenFOAM, distributed under GPL-3.0-or-later. # This file is part of OpenFOAM, distributed under GPL-3.0-or-later.
@ -28,6 +28,7 @@ sed -ne '1,/^[Oo]ptions:/d' \
-e '/^-doc-source/d; /^-help-man/d;' \ -e '/^-doc-source/d; /^-help-man/d;' \
-e '/^-hostRoots /d; /^-roots /d;' \ -e '/^-hostRoots /d; /^-roots /d;' \
-e '/^-lib /d; /^-no-libs /d;' \ -e '/^-lib /d; /^-no-libs /d;' \
-e '/^-mpi-.*/d;' \
-e '/^-[a-z]*-switch /d;' \ -e '/^-[a-z]*-switch /d;' \
-e 'y/,/ /; s/=.*$/=/;' \ -e 'y/,/ /; s/=.*$/=/;' \
-e '/^-[^ ]* </{ s/^\(-[^ ]* <\).*$/\1/; p; d }' \ -e '/^-[^ ]* </{ s/^\(-[^ ]* <\).*$/\1/; p; d }' \

View File

@ -445,7 +445,8 @@ Foam::label Foam::UPstream::dupCommunicator
Foam::label Foam::UPstream::splitCommunicator Foam::label Foam::UPstream::splitCommunicator
( (
const label parentIndex, const label parentIndex,
const int colour const int colour,
const bool two_step
) )
{ {
#ifdef FULLDEBUG #ifdef FULLDEBUG
@ -465,7 +466,8 @@ Foam::label Foam::UPstream::splitCommunicator
{ {
Perr<< "Split communicator [" Perr<< "Split communicator ["
<< index << "] from [" << parentIndex << index << "] from [" << parentIndex
<< "] using colour=" << colour << endl; << "] using colour=" << colour
<< " (two_step=" << two_step << ")" << endl;
} }
// Initially treat as unknown, // Initially treat as unknown,
@ -475,7 +477,7 @@ Foam::label Foam::UPstream::splitCommunicator
if (UPstream::parRun()) if (UPstream::parRun())
{ {
splitCommunicatorComponents(parentIndex, index, colour); splitCommunicatorComponents(parentIndex, index, colour, two_step);
} }
return index; return index;
@ -857,6 +859,8 @@ bool Foam::UPstream::parRun_(false);
bool Foam::UPstream::haveThreads_(false); bool Foam::UPstream::haveThreads_(false);
bool Foam::UPstream::noInitialCommDup_(false);
int Foam::UPstream::msgType_(1); int Foam::UPstream::msgType_(1);

View File

@ -397,6 +397,9 @@ private:
//- Have support for threads? //- Have support for threads?
static bool haveThreads_; static bool haveThreads_;
//- Initial MPI_Comm_dup(MPI_COMM_WORLD,...) disabled? (default: false)
static bool noInitialCommDup_;
//- Standard transfer message type //- Standard transfer message type
static int msgType_; static int msgType_;
@ -495,7 +498,9 @@ private:
( (
const label parentIndex, const label parentIndex,
const label index, const label index,
const int colour const int colour,
//! Use MPI_Allgather+MPI_Comm_create_group vs MPI_Comm_split
const bool two_step = true
); );
//- Free MPI components of communicator. //- Free MPI components of communicator.
@ -877,7 +882,10 @@ public:
//! The colouring to select which ranks to include. //! The colouring to select which ranks to include.
//! Negative values correspond to 'ignore' //! Negative values correspond to 'ignore'
const int colour const int colour,
//! Use MPI_Allgather+MPI_Comm_create_group vs MPI_Comm_split
const bool two_step = true
); );
//- Free a previously allocated communicator. //- Free a previously allocated communicator.
@ -949,11 +957,14 @@ public:
const label parentComm, const label parentComm,
//! The colouring to select which ranks to include. //! The colouring to select which ranks to include.
//! Negative values correspond to 'ignore' //! Negative values correspond to 'ignore'
const int colour const int colour,
//! Use MPI_Allgather+MPI_Comm_create_group vs MPI_Comm_split
const bool two_step = true
) )
{ {
communicator c; communicator c;
c.comm_ = UPstream::splitCommunicator(parentComm, colour); c.comm_ =
UPstream::splitCommunicator(parentComm, colour, two_step);
return c; return c;
} }

View File

@ -128,7 +128,14 @@ Foam::argList::initValidTables::initValidTables()
( (
"mpi-threads", "mpi-threads",
"Request use of MPI threads", "Request use of MPI threads",
true // advanced option true // advanced option
);
argList::addBoolOption
(
"mpi-no-comm-dup",
"Disable initial MPI_Comm_dup()",
true // advanced option
); );
argList::addOption argList::addOption
@ -596,6 +603,7 @@ void Foam::argList::noParallel()
removeOption("hostRoots"); removeOption("hostRoots");
removeOption("world"); removeOption("world");
removeOption("mpi-threads"); removeOption("mpi-threads");
removeOption("mpi-no-comm-dup");
validParOptions.clear(); validParOptions.clear();
} }

View File

@ -100,7 +100,8 @@ void Foam::UPstream::splitCommunicatorComponents
( (
const label parentIndex, const label parentIndex,
const label index, const label index,
int colour int colour,
const bool two_step
) )
{} {}

View File

@ -262,34 +262,61 @@ bool Foam::UPstream::init(int& argc, char**& argv, const bool needsThread)
} }
// Check argument list for local world // Check argument list for any of the following:
label worldIndex = -1; // - local world
// -> Extract world name and filter out '-world <name>' from argv list
// - mpi-no-comm-dup option
// -> disable initial comm_dup and filter out the option
// Default handling of initial MPI_Comm_dup(MPI_COMM_WORLD,...)
UPstream::noInitialCommDup_ = false;
// Local world name
word worldName;
for (int argi = 1; argi < argc; ++argi) for (int argi = 1; argi < argc; ++argi)
{ {
if (strcmp(argv[argi], "-world") == 0) const char *optName = argv[argi];
if (optName[0] != '-')
{
continue;
}
++optName; // Looks like an option, skip leading '-'
if (strcmp(optName, "world") == 0)
{ {
worldIndex = argi;
if (argi+1 >= argc) if (argi+1 >= argc)
{ {
FatalErrorInFunction FatalErrorInFunction
<< "Missing world name for option '-world'" << nl << "Missing world name for option '-world'" << nl
<< Foam::abort(FatalError); << Foam::abort(FatalError);
} }
break; worldName = argv[argi+1];
// Remove two arguments (-world name)
for (int i = argi+2; i < argc; ++i)
{
argv[i-2] = argv[i];
}
argc -= 2;
--argi; // re-examine
}
else if (strcmp(optName, "mpi-no-comm-dup") == 0)
{
UPstream::noInitialCommDup_ = true;
// Remove one argument
for (int i = argi+1; i < argc; ++i)
{
argv[i-1] = argv[i];
}
--argc;
--argi; // re-examine
} }
} }
// Extract world name and filter out '-world <name>' from argv list const bool hasLocalWorld(!worldName.empty());
word worldName;
if (worldIndex != -1)
{
worldName = argv[worldIndex+1];
for (label i = worldIndex+2; i < argc; i++)
{
argv[i-2] = argv[i];
}
argc -= 2;
}
int numProcs = 0, globalRanki = 0; int numProcs = 0, globalRanki = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &globalRanki); MPI_Comm_rank(MPI_COMM_WORLD, &globalRanki);
@ -314,7 +341,7 @@ bool Foam::UPstream::init(int& argc, char**& argv, const bool needsThread)
<< " world:" << worldName << endl; << " world:" << worldName << endl;
} }
if (worldIndex == -1 && numProcs <= 1) if (numProcs <= 1 && !(hasLocalWorld))
{ {
FatalErrorInFunction FatalErrorInFunction
<< "attempt to run parallel on 1 processor" << "attempt to run parallel on 1 processor"
@ -324,7 +351,7 @@ bool Foam::UPstream::init(int& argc, char**& argv, const bool needsThread)
// Initialise parallel structure // Initialise parallel structure
setParRun(numProcs, provided_thread_support == MPI_THREAD_MULTIPLE); setParRun(numProcs, provided_thread_support == MPI_THREAD_MULTIPLE);
if (worldIndex != -1) if (hasLocalWorld)
{ {
// Using local worlds. // Using local worlds.
// During startup, so commWorld() == commGlobal() // During startup, so commWorld() == commGlobal()
@ -333,7 +360,7 @@ bool Foam::UPstream::init(int& argc, char**& argv, const bool needsThread)
// Gather the names of all worlds and determine unique names/indices. // Gather the names of all worlds and determine unique names/indices.
// //
// Minimize communication and use low-level MPI to relying on any // Minimize communication and use low-level MPI to avoid relying on any
// OpenFOAM structures which not yet have been created // OpenFOAM structures which not yet have been created
{ {
@ -619,10 +646,10 @@ void Foam::UPstream::exit(int errNo)
void Foam::UPstream::abort(int errNo) void Foam::UPstream::abort(int errNo)
{ {
// TBD: only abort on our own communicator?
#if 0
MPI_Comm abortComm = MPI_COMM_WORLD; MPI_Comm abortComm = MPI_COMM_WORLD;
// TBD: only abort on our own communicator?
#if 0
const label index = UPstream::commGlobal(); const label index = UPstream::commGlobal();
if (index > 0 && index < PstreamGlobals::MPICommunicators_.size()) if (index > 0 && index < PstreamGlobals::MPICommunicators_.size())
@ -633,10 +660,9 @@ void Foam::UPstream::abort(int errNo)
abortComm = MPI_COMM_WORLD; abortComm = MPI_COMM_WORLD;
} }
} }
MPI_Abort(abortComm, errNo);
#endif #endif
MPI_Abort(MPI_COMM_WORLD, errNo); MPI_Abort(abortComm, errNo);
} }
@ -665,11 +691,16 @@ void Foam::UPstream::allocateCommunicatorComponents
} }
auto& mpiNewComm = PstreamGlobals::MPICommunicators_[index]; auto& mpiNewComm = PstreamGlobals::MPICommunicators_[index];
// PstreamGlobals::pendingMPIFree_[index] = false; if (UPstream::noInitialCommDup_)
// PstreamGlobals::MPICommunicators_[index] = MPI_COMM_WORLD; {
PstreamGlobals::pendingMPIFree_[index] = false;
PstreamGlobals::pendingMPIFree_[index] = true; PstreamGlobals::MPICommunicators_[index] = MPI_COMM_WORLD;
MPI_Comm_dup(MPI_COMM_WORLD, &mpiNewComm); }
else
{
PstreamGlobals::pendingMPIFree_[index] = true;
MPI_Comm_dup(MPI_COMM_WORLD, &mpiNewComm);
}
MPI_Comm_rank(mpiNewComm, &myProcNo_[index]); MPI_Comm_rank(mpiNewComm, &myProcNo_[index]);
@ -807,7 +838,8 @@ void Foam::UPstream::splitCommunicatorComponents
( (
const label parentIndex, const label parentIndex,
const label index, const label index,
int colour int colour,
const bool two_step
) )
{ {
PstreamGlobals::initCommunicator(index); PstreamGlobals::initCommunicator(index);
@ -823,7 +855,7 @@ void Foam::UPstream::splitCommunicatorComponents
// the relative rank order when splitting). // the relative rank order when splitting).
// //
// Since MPI_Comm_split() already does an MPI_Allgather() internally // Since MPI_Comm_split() already does an MPI_Allgather() internally
// to pick out the colours (and do any sorting), we can simply to // to pick out the colours (and do any sorting), we can simply
// do the same thing: // do the same thing:
// //
// Do the Allgather first and pickout identical colours to define the // Do the Allgather first and pickout identical colours to define the
@ -840,39 +872,106 @@ void Foam::UPstream::splitCommunicatorComponents
MPI_Comm_rank(mpiParentComm, &parentRank); MPI_Comm_rank(mpiParentComm, &parentRank);
MPI_Comm_size(mpiParentComm, &parentSize); MPI_Comm_size(mpiParentComm, &parentSize);
// Initialize, first marking the 'procIDs_' with the colours
auto& procIds = procIDs_[index]; auto& procIds = procIDs_[index];
myProcNo_[index] = -1; myProcNo_[index] = -1;
procIds.resize_nocopy(parentSize);
procIds[parentRank] = colour;
MPI_Allgather if (two_step)
(
MPI_IN_PLACE, 0, MPI_INT,
procIds.data(), 1, MPI_INT,
mpiParentComm
);
if (colour < 0)
{ {
procIds.clear(); // First gather the colours
procIds.resize_nocopy(parentSize);
procIds[parentRank] = colour;
MPI_Allgather
(
MPI_IN_PLACE, 0, MPI_INT,
procIds.data(), 1, MPI_INT,
mpiParentComm
);
if (colour < 0)
{
// Not involved
procIds.clear();
}
else
{
// Select ranks based on the matching colour
int nranks = 0;
for (int i = 0; i < parentSize; ++i)
{
if (procIds[i] == colour)
{
procIds[nranks++] = i;
}
}
procIds.resize(nranks);
}
allocateCommunicatorComponents(parentIndex, index);
} }
else else
{ {
auto last = auto& mpiNewComm = PstreamGlobals::MPICommunicators_[index];
std::copy_if
MPI_Comm_split
(
mpiParentComm,
(colour >= 0 ? colour : MPI_UNDEFINED),
0, // maintain relative ordering
&mpiNewComm
);
if (MPI_COMM_NULL == mpiNewComm)
{
// Not involved
PstreamGlobals::pendingMPIFree_[index] = false;
procIds.clear();
}
else
{
PstreamGlobals::pendingMPIFree_[index] = true;
MPI_Comm_rank(mpiNewComm, &myProcNo_[index]);
// Starting from parent
MPI_Group parent_group;
MPI_Comm_group(mpiParentComm, &parent_group);
MPI_Group new_group;
MPI_Comm_group(mpiNewComm, &new_group);
// Parent ranks: identity map
List<int> parentIds(parentSize);
std::iota(parentIds.begin(), parentIds.end(), 0);
// New ranks:
procIds.resize_nocopy(parentSize);
procIds = -1; // Some extra safety...
MPI_Group_translate_ranks
( (
procIds.cbegin(), parent_group, parentSize, parentIds.data(),
procIds.cend(), new_group, procIds.data()
procIds.begin(),
[=](int c){ return (c == colour); }
); );
procIds.resize(std::distance(procIds.begin(), last)); // Groups not needed after this...
} MPI_Group_free(&parent_group);
MPI_Group_free(&new_group);
allocateCommunicatorComponents(parentIndex, index); // The corresponding ranks.
// - since old ranks are an identity map, can just use position
int nranks = 0;
for (int i = 0; i < parentSize; ++i)
{
// Exclude MPI_UNDEFINED and MPI_PROC_NULL etc...
if (procIds[i] >= 0 && procIds[i] < parentSize)
{
procIds[nranks++] = i;
}
}
procIds.resize(nranks);
}
}
} }