ENH: allow disabling of initial MPI_Comm_dup(MPI_COMM_WORLD,...)

- can use -mpi-no-comm-dup to suppress the initial communicator
  duplication (to avoid potential deadlock with coupled processes).
  This is partly related to comments in merge-request !735

ENH: simplify parsing/removal of local -world option

- can extract the world name in a single pass, which also makes the
  parsing more robust.

ENH: support regular MPI_Comm_split()

- the two-step method of Allgather + Comm_create_group may not be
  expected by other applications (issue #3127) and that can lead to
  deadlock, so also add code for the regular MPI_Comm_split.
  Does not support re-sorting keys!

FIX: faulty logic for splitting communicators

- only affected the more recent develop branch
This commit is contained in:
Mark Olesen
2025-04-30 09:59:59 +02:00
parent c5ceec3c73
commit 34143b433a
9 changed files with 222 additions and 68 deletions

View File

@ -1,3 +1,3 @@
Test-parallel-comm2.C
Test-parallel-comm2.cxx
EXE = $(FOAM_USER_APPBIN)/Test-parallel-comm2

View File

@ -69,6 +69,7 @@ int main(int argc, char *argv[])
argList::addBoolOption("info", "information");
argList::addBoolOption("print-tree", "Report tree(s) as graph");
argList::addBoolOption("no-test", "Disable general tests");
argList::addBoolOption("split", "Test Pstream split-comm");
argList::addBoolOption("host-comm", "Test Pstream host-comm");
argList::addBoolOption("host-broadcast", "Test host-base broadcasts");
@ -85,8 +86,8 @@ int main(int argc, char *argv[])
if (UPstream::parRun() && optPrintTree)
{
Info<< "comms: "
<< UPstream::whichCommunication(UPstream::worldComm) << nl;
// Info<< "comms: "
// << UPstream::whichCommunication(UPstream::worldComm) << nl;
UPstream::printCommTree(UPstream::commWorld());
}
@ -102,6 +103,34 @@ int main(int argc, char *argv[])
<< flatOutput(UPstream::procID(UPstream::commLocalNode())) << nl;
}
if (UPstream::parRun() && args.found("split"))
{
Info<< "split: alternative ranks" << nl;
const auto myRank = UPstream::myProcNo();
int colour =
(
(myRank == 5 || myRank == 6) // Exclude these ones
? -1
: (myRank % 2)
);
UPstream::communicator comm =
UPstream::communicator::split(UPstream::commWorld(), colour, true);
Pout<< "split ranks (colour=" << colour << ") "
<< flatOutput(UPstream::procID(comm.comm())) << nl;
comm.reset();
comm =
UPstream::communicator::split(UPstream::commWorld(), colour, false);
Pout<< "Split ranks (colour=" << colour << ") "
<< flatOutput(UPstream::procID(comm.comm())) << nl;
}
if (args.found("info"))
{
Info<< nl;
@ -135,8 +164,8 @@ int main(int argc, char *argv[])
<< endl;
{
Info<< "host-master: "
<< UPstream::whichCommunication(commInterNode) << endl;
// Info<< "host-master: "
// << UPstream::whichCommunication(commInterNode) << endl;
UPstream::printCommTree(commInterNode);
UPstream::printCommTree(commLocalNode);

View File

@ -6,7 +6,7 @@
# \\ / A nd | www.openfoam.com
# \\/ M anipulation |
#------------------------------------------------------------------------------
# Copyright (C) 2017-2023 OpenCFD Ltd.
# Copyright (C) 2017-2025 OpenCFD Ltd.
#------------------------------------------------------------------------------
# License
# This file is part of OpenFOAM, distributed under GPL-3.0-or-later.
@ -168,6 +168,7 @@ extractOptions()
-e '/^-doc-source/d; /^-help-man/d;' \
-e '/^-hostRoots /d; /^-roots /d;' \
-e '/^-lib /d; /^-no-libs /d;' \
-e '/^-mpi-.*/d;' \
-e '/^-[a-z]*-switch /d;' \
-e 'y/,/ /; s/=.*$/=/;' \
-e '/^-[^ ]* </{ s/^\(-[^ ]* <\).*$/\1/; p; d }' \

View File

@ -6,7 +6,7 @@
# \\ / A nd | www.openfoam.com
# \\/ M anipulation |
#-------------------------------------------------------------------------------
# Copyright (C) 2020-2023 OpenCFD Ltd.
# Copyright (C) 2020-2025 OpenCFD Ltd.
#------------------------------------------------------------------------------
# License
# This file is part of OpenFOAM, distributed under GPL-3.0-or-later.
@ -28,6 +28,7 @@ sed -ne '1,/^[Oo]ptions:/d' \
-e '/^-doc-source/d; /^-help-man/d;' \
-e '/^-hostRoots /d; /^-roots /d;' \
-e '/^-lib /d; /^-no-libs /d;' \
-e '/^-mpi-.*/d;' \
-e '/^-[a-z]*-switch /d;' \
-e 'y/,/ /; s/=.*$/=/;' \
-e '/^-[^ ]* </{ s/^\(-[^ ]* <\).*$/\1/; p; d }' \

View File

@ -445,7 +445,8 @@ Foam::label Foam::UPstream::dupCommunicator
Foam::label Foam::UPstream::splitCommunicator
(
const label parentIndex,
const int colour
const int colour,
const bool two_step
)
{
#ifdef FULLDEBUG
@ -465,7 +466,8 @@ Foam::label Foam::UPstream::splitCommunicator
{
Perr<< "Split communicator ["
<< index << "] from [" << parentIndex
<< "] using colour=" << colour << endl;
<< "] using colour=" << colour
<< " (two_step=" << two_step << ")" << endl;
}
// Initially treat as unknown,
@ -475,7 +477,7 @@ Foam::label Foam::UPstream::splitCommunicator
if (UPstream::parRun())
{
splitCommunicatorComponents(parentIndex, index, colour);
splitCommunicatorComponents(parentIndex, index, colour, two_step);
}
return index;
@ -857,6 +859,8 @@ bool Foam::UPstream::parRun_(false);
bool Foam::UPstream::haveThreads_(false);
bool Foam::UPstream::noInitialCommDup_(false);
int Foam::UPstream::msgType_(1);

View File

@ -397,6 +397,9 @@ private:
//- Have support for threads?
static bool haveThreads_;
//- Initial MPI_Comm_dup(MPI_COMM_WORLD,...) disabled? (default: false)
static bool noInitialCommDup_;
//- Standard transfer message type
static int msgType_;
@ -495,7 +498,9 @@ private:
(
const label parentIndex,
const label index,
const int colour
const int colour,
//! Use MPI_Allgather+MPI_Comm_create_group vs MPI_Comm_split
const bool two_step = true
);
//- Free MPI components of communicator.
@ -877,7 +882,10 @@ public:
//! The colouring to select which ranks to include.
//! Negative values correspond to 'ignore'
const int colour
const int colour,
//! Use MPI_Allgather+MPI_Comm_create_group vs MPI_Comm_split
const bool two_step = true
);
//- Free a previously allocated communicator.
@ -949,11 +957,14 @@ public:
const label parentComm,
//! The colouring to select which ranks to include.
//! Negative values correspond to 'ignore'
const int colour
const int colour,
//! Use MPI_Allgather+MPI_Comm_create_group vs MPI_Comm_split
const bool two_step = true
)
{
communicator c;
c.comm_ = UPstream::splitCommunicator(parentComm, colour);
c.comm_ =
UPstream::splitCommunicator(parentComm, colour, two_step);
return c;
}

View File

@ -128,7 +128,14 @@ Foam::argList::initValidTables::initValidTables()
(
"mpi-threads",
"Request use of MPI threads",
true // advanced option
true // advanced option
);
argList::addBoolOption
(
"mpi-no-comm-dup",
"Disable initial MPI_Comm_dup()",
true // advanced option
);
argList::addOption
@ -596,6 +603,7 @@ void Foam::argList::noParallel()
removeOption("hostRoots");
removeOption("world");
removeOption("mpi-threads");
removeOption("mpi-no-comm-dup");
validParOptions.clear();
}

View File

@ -100,7 +100,8 @@ void Foam::UPstream::splitCommunicatorComponents
(
const label parentIndex,
const label index,
int colour
int colour,
const bool two_step
)
{}

View File

@ -262,34 +262,61 @@ bool Foam::UPstream::init(int& argc, char**& argv, const bool needsThread)
}
// Check argument list for local world
label worldIndex = -1;
// Check argument list for any of the following:
// - local world
// -> Extract world name and filter out '-world <name>' from argv list
// - mpi-no-comm-dup option
// -> disable initial comm_dup and filter out the option
// Default handling of initial MPI_Comm_dup(MPI_COMM_WORLD,...)
UPstream::noInitialCommDup_ = false;
// Local world name
word worldName;
for (int argi = 1; argi < argc; ++argi)
{
if (strcmp(argv[argi], "-world") == 0)
const char *optName = argv[argi];
if (optName[0] != '-')
{
continue;
}
++optName; // Looks like an option, skip leading '-'
if (strcmp(optName, "world") == 0)
{
worldIndex = argi;
if (argi+1 >= argc)
{
FatalErrorInFunction
<< "Missing world name for option '-world'" << nl
<< Foam::abort(FatalError);
}
break;
worldName = argv[argi+1];
// Remove two arguments (-world name)
for (int i = argi+2; i < argc; ++i)
{
argv[i-2] = argv[i];
}
argc -= 2;
--argi; // re-examine
}
else if (strcmp(optName, "mpi-no-comm-dup") == 0)
{
UPstream::noInitialCommDup_ = true;
// Remove one argument
for (int i = argi+1; i < argc; ++i)
{
argv[i-1] = argv[i];
}
--argc;
--argi; // re-examine
}
}
// Extract world name and filter out '-world <name>' from argv list
word worldName;
if (worldIndex != -1)
{
worldName = argv[worldIndex+1];
for (label i = worldIndex+2; i < argc; i++)
{
argv[i-2] = argv[i];
}
argc -= 2;
}
const bool hasLocalWorld(!worldName.empty());
int numProcs = 0, globalRanki = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &globalRanki);
@ -314,7 +341,7 @@ bool Foam::UPstream::init(int& argc, char**& argv, const bool needsThread)
<< " world:" << worldName << endl;
}
if (worldIndex == -1 && numProcs <= 1)
if (numProcs <= 1 && !(hasLocalWorld))
{
FatalErrorInFunction
<< "attempt to run parallel on 1 processor"
@ -324,7 +351,7 @@ bool Foam::UPstream::init(int& argc, char**& argv, const bool needsThread)
// Initialise parallel structure
setParRun(numProcs, provided_thread_support == MPI_THREAD_MULTIPLE);
if (worldIndex != -1)
if (hasLocalWorld)
{
// Using local worlds.
// During startup, so commWorld() == commGlobal()
@ -333,7 +360,7 @@ bool Foam::UPstream::init(int& argc, char**& argv, const bool needsThread)
// Gather the names of all worlds and determine unique names/indices.
//
// Minimize communication and use low-level MPI to relying on any
// Minimize communication and use low-level MPI to avoid relying on any
// OpenFOAM structures which not yet have been created
{
@ -619,10 +646,10 @@ void Foam::UPstream::exit(int errNo)
void Foam::UPstream::abort(int errNo)
{
// TBD: only abort on our own communicator?
#if 0
MPI_Comm abortComm = MPI_COMM_WORLD;
// TBD: only abort on our own communicator?
#if 0
const label index = UPstream::commGlobal();
if (index > 0 && index < PstreamGlobals::MPICommunicators_.size())
@ -633,10 +660,9 @@ void Foam::UPstream::abort(int errNo)
abortComm = MPI_COMM_WORLD;
}
}
MPI_Abort(abortComm, errNo);
#endif
MPI_Abort(MPI_COMM_WORLD, errNo);
MPI_Abort(abortComm, errNo);
}
@ -665,11 +691,16 @@ void Foam::UPstream::allocateCommunicatorComponents
}
auto& mpiNewComm = PstreamGlobals::MPICommunicators_[index];
// PstreamGlobals::pendingMPIFree_[index] = false;
// PstreamGlobals::MPICommunicators_[index] = MPI_COMM_WORLD;
PstreamGlobals::pendingMPIFree_[index] = true;
MPI_Comm_dup(MPI_COMM_WORLD, &mpiNewComm);
if (UPstream::noInitialCommDup_)
{
PstreamGlobals::pendingMPIFree_[index] = false;
PstreamGlobals::MPICommunicators_[index] = MPI_COMM_WORLD;
}
else
{
PstreamGlobals::pendingMPIFree_[index] = true;
MPI_Comm_dup(MPI_COMM_WORLD, &mpiNewComm);
}
MPI_Comm_rank(mpiNewComm, &myProcNo_[index]);
@ -807,7 +838,8 @@ void Foam::UPstream::splitCommunicatorComponents
(
const label parentIndex,
const label index,
int colour
int colour,
const bool two_step
)
{
PstreamGlobals::initCommunicator(index);
@ -823,7 +855,7 @@ void Foam::UPstream::splitCommunicatorComponents
// the relative rank order when splitting).
//
// Since MPI_Comm_split() already does an MPI_Allgather() internally
// to pick out the colours (and do any sorting), we can simply to
// to pick out the colours (and do any sorting), we can simply
// do the same thing:
//
// Do the Allgather first and pickout identical colours to define the
@ -840,39 +872,106 @@ void Foam::UPstream::splitCommunicatorComponents
MPI_Comm_rank(mpiParentComm, &parentRank);
MPI_Comm_size(mpiParentComm, &parentSize);
// Initialize, first marking the 'procIDs_' with the colours
auto& procIds = procIDs_[index];
myProcNo_[index] = -1;
procIds.resize_nocopy(parentSize);
procIds[parentRank] = colour;
MPI_Allgather
(
MPI_IN_PLACE, 0, MPI_INT,
procIds.data(), 1, MPI_INT,
mpiParentComm
);
if (colour < 0)
if (two_step)
{
procIds.clear();
// First gather the colours
procIds.resize_nocopy(parentSize);
procIds[parentRank] = colour;
MPI_Allgather
(
MPI_IN_PLACE, 0, MPI_INT,
procIds.data(), 1, MPI_INT,
mpiParentComm
);
if (colour < 0)
{
// Not involved
procIds.clear();
}
else
{
// Select ranks based on the matching colour
int nranks = 0;
for (int i = 0; i < parentSize; ++i)
{
if (procIds[i] == colour)
{
procIds[nranks++] = i;
}
}
procIds.resize(nranks);
}
allocateCommunicatorComponents(parentIndex, index);
}
else
{
auto last =
std::copy_if
auto& mpiNewComm = PstreamGlobals::MPICommunicators_[index];
MPI_Comm_split
(
mpiParentComm,
(colour >= 0 ? colour : MPI_UNDEFINED),
0, // maintain relative ordering
&mpiNewComm
);
if (MPI_COMM_NULL == mpiNewComm)
{
// Not involved
PstreamGlobals::pendingMPIFree_[index] = false;
procIds.clear();
}
else
{
PstreamGlobals::pendingMPIFree_[index] = true;
MPI_Comm_rank(mpiNewComm, &myProcNo_[index]);
// Starting from parent
MPI_Group parent_group;
MPI_Comm_group(mpiParentComm, &parent_group);
MPI_Group new_group;
MPI_Comm_group(mpiNewComm, &new_group);
// Parent ranks: identity map
List<int> parentIds(parentSize);
std::iota(parentIds.begin(), parentIds.end(), 0);
// New ranks:
procIds.resize_nocopy(parentSize);
procIds = -1; // Some extra safety...
MPI_Group_translate_ranks
(
procIds.cbegin(),
procIds.cend(),
procIds.begin(),
[=](int c){ return (c == colour); }
parent_group, parentSize, parentIds.data(),
new_group, procIds.data()
);
procIds.resize(std::distance(procIds.begin(), last));
}
// Groups not needed after this...
MPI_Group_free(&parent_group);
MPI_Group_free(&new_group);
allocateCommunicatorComponents(parentIndex, index);
// The corresponding ranks.
// - since old ranks are an identity map, can just use position
int nranks = 0;
for (int i = 0; i < parentSize; ++i)
{
// Exclude MPI_UNDEFINED and MPI_PROC_NULL etc...
if (procIds[i] >= 0 && procIds[i] < parentSize)
{
procIds[nranks++] = i;
}
}
procIds.resize(nranks);
}
}
}