ENH: extend parProfiling (#2737)

- separate broadcast times from reduce/gather/scatter time
- separate wait times from all-to-all time
- support invocation counts, split off requests time/count
  from others to avoid flooding the counts

- support 'detail' switch to increase the output information.
  Format may change in the future
This commit is contained in:
Mark Olesen
2023-04-04 20:54:15 +02:00
parent 475ed5cc32
commit 9577a0f6b5
13 changed files with 520 additions and 168 deletions

View File

@ -5,7 +5,7 @@
\\ / A nd | www.openfoam.com
\\/ M anipulation |
-------------------------------------------------------------------------------
Copyright (C) 2019-2022 OpenCFD Ltd.
Copyright (C) 2019-2023 OpenCFD Ltd.
-------------------------------------------------------------------------------
License
This file is part of OpenFOAM.
@ -61,8 +61,10 @@ Foam::functionObjects::parProfiling::parProfiling
const dictionary& dict
)
:
functionObject(name)
functionObject(name),
detailLevel_(0)
{
dict.readIfPresent("detail", detailLevel_);
profilingPstream::enable();
}
@ -75,121 +77,365 @@ Foam::functionObjects::parProfiling::~parProfiling()
}
// * * * * * * * * * * * * * * * Member Functions * * * * * * * * * * * * * //
// * * * * * * * * * * * * * * * Local Functions * * * * * * * * * * * * * * //
void Foam::functionObjects::parProfiling::report()
namespace Foam
{
if (!profilingPstream::active())
// Loop over all values (with striding) and extract the value at given index
template<class Type>
inline static void extractValues
(
UList<Type>& result,
const int index,
const UList<Type>& allValues
)
{
if (result.empty())
{
return;
}
// (Time, Processor) for each of: min/max/sum
typedef FixedList<Tuple2<double, int>, 3> statData;
typedef FixedList<statData, 3> statDataTimes;
const label numProc = result.size();
const Type* values = allValues.cbegin();
const label stride = allValues.size() / numProc;
// Reduction: if x and y are unequal assign value.
auto statsEqOp = [](statDataTimes& xStats, const statDataTimes& yStats)
if (!values || !stride)
{
forAll(xStats, i)
result = Type(0);
return;
}
for (label proci = 0; proci < numProc; ++proci, values += stride)
{
result[proci] = values[index];
}
}
// Loop over all values (with striding) and extract combined value
// using the given unary function
template<class Type, class Extract>
inline static void extractValues
(
UList<Type>& result,
const UList<Type>& allValues,
const Extract& extract
)
{
if (result.empty())
{
return;
}
const label numProc = result.size();
const Type* values = allValues.cbegin();
const label stride = allValues.size() / numProc;
if (!values || !stride)
{
result = Type(0);
return;
}
for (label proci = 0; proci < numProc; ++proci, values += stride)
{
result[proci] = extract(values);
}
}
inline static void printTimingDetail(const UList<double>& values)
{
const label numProc = values.size();
if (numProc)
{
Info<< indent << " times " << numProc << '(';
for (label proci = 0; proci < numProc; ++proci)
{
statData& x = xStats[i];
const statData& y = yStats[i];
// 0: min, 1: max, 2: total (or avg)
if (x[0].first() > y[0].first())
{
x[0] = y[0];
}
if (x[1].first() < y[1].first())
{
x[1] = y[1];
}
x[2].first() += y[2].first();
if (proci) Info<< ' ';
Info<< values[proci];
}
};
statDataTimes times;
Info<< ')' << nl;
}
}
// Master time
inline static void printTimingDetail(const UList<uint64_t>& values)
{
const label numProc = values.size();
if (numProc)
{
const double total =
(
profilingPstream::times(profilingPstream::REDUCE)
+ profilingPstream::times(profilingPstream::GATHER)
+ profilingPstream::times(profilingPstream::SCATTER)
// Include broadcast with reduce instead of all-to-all
+ profilingPstream::times(profilingPstream::BROADCAST)
);
// Output via std::ostream to avoid conversion to Foam::label
// that Ostream performs
times[0] = Tuple2<double, int>(total, Pstream::myProcNo());
auto& os = Info.stdStream();
Info<< indent << " counts " << numProc << '(';
for (label proci = 0; proci < numProc; ++proci)
{
if (proci) os << ' ';
os << values[proci];
}
Info<< ')' << nl;
}
}
} // End namespace Foam
// * * * * * * * * * * * * * * * Member Functions * * * * * * * * * * * * * //
void Foam::functionObjects::parProfiling::report()
{
const label numProc = (UPstream::parRun() ? UPstream::nProcs() : 1);
if (!profilingPstream::active() || numProc < 2)
{
return;
}
// All time
{
const double total =
(
profilingPstream::times(profilingPstream::WAIT)
+ profilingPstream::times(profilingPstream::ALL_TO_ALL)
+ profilingPstream::times(profilingPstream::OTHER)
);
// Use mpiGather on all values and perform the combinations
// and statistics locally. This reduces the overall number of MPI
// calls. For detailed output we need this information anyhow.
times[1] = Tuple2<double, int>(total, Pstream::myProcNo());
}
// NB: profilingPstream uses a FixedList for timings(), counts()
// so the sizes are guaranteed to be consistent and identical
// everywhere.
// Other time
{
const double total =
(
profilingPstream::times(profilingPstream::OTHER)
);
times[2] = Tuple2<double, int>(total, Pstream::myProcNo());
}
List<double> allTimes;
List<uint64_t> allCounts;
// Avoid disturbing the counts
profilingPstream::suspend();
Pstream::combineGather(times, statsEqOp);
{
// The timings
const auto& procTimes = profilingPstream::times();
if (Pstream::master())
{
allTimes.resize(numProc * procTimes.size());
}
UPstream::mpiGather
(
procTimes.cdata_bytes(), // Send
procTimes.size_bytes(), // Num send per proc
allTimes.data_bytes(), // Recv
procTimes.size_bytes(), // Num recv per proc
UPstream::commWorld()
);
}
if (detailLevel_ > 1)
{
// The counts
const auto& procCounts = profilingPstream::counts();
if (Pstream::master())
{
allCounts.resize(numProc * procCounts.size());
}
UPstream::mpiGather
(
procCounts.cdata_bytes(), // Send
procCounts.size_bytes(), // Num send per proc
allCounts.data_bytes(), // Recv
procCounts.size_bytes(), // Num recv per proc
UPstream::commWorld()
);
}
profilingPstream::resume();
// (Time, Processor) for each of: min/max/sum(avg)
typedef FixedList<Tuple2<double, int>, 3> statData;
// Extract min/max/average
auto calcStats = [](const UList<double>& data) -> statData
{
statData stats;
stats = Tuple2<double, int>((data.empty() ? 0 : data[0]), 0);
const label np = data.size();
for (label proci = 1; proci < np; ++proci)
{
Tuple2<double, int> tup(data[proci], proci);
// 0: min, 1: max, 2: total(avg)
if (stats[0].first() > tup.first()) stats[0] = tup;
if (stats[1].first() < tup.first()) stats[1] = tup;
stats[2].first() += tup.first();
}
// From total -> average value
if (np) { stats[2].first() /= np; }
return stats;
};
const auto printTimingStats =
[&](Ostream& os, const char* tag, const statData& stats)
{
os << indent << tag << ": avg = " << stats[2].first()
<< ", min = " << stats[0].first()
<< " (proc " << stats[0].second() << ')'
<< ", max = " << stats[1].first()
<< " (proc " << stats[1].second() << ')'
<< nl;
};
if (Pstream::master())
{
statData stats;
List<double> extractedTimes(numProc);
List<uint64_t> extractedCounts;
if (detailLevel_ > 1)
{
extractedCounts.resize(numProc);
}
Info<< type() << ':' << nl
<< incrIndent;
// Total times
{
const statData& stats = times[0];
double avg = stats[2].first()/Pstream::nProcs();
extractValues
(
extractedTimes,
allTimes,
[=](const double values[])
{
double total = 0;
for (unsigned i = 0; i < profilingPstream::nCategories; ++i)
{
total += values[i];
}
return total;
}
);
stats = calcStats(extractedTimes);
Info<< indent << "reduce : avg = " << avg << 's' << nl
<< indent << " min = " << stats[0].first()
<< "s (processor " << stats[0].second() << ')' << nl
<< indent << " max = " << stats[1].first()
<< "s (processor " << stats[1].second() << ')' << nl;
printTimingStats(Info(), "total ", stats);
if (detailLevel_ > 0) printTimingDetail(extractedTimes);
}
// all-all
{
const statData& stats = times[1];
double avg = stats[2].first()/Pstream::nProcs();
const int index = int(profilingPstream::ALL_TO_ALL);
Info<< indent << "all-all : avg = " << avg << 's' << nl
<< indent << " min = " << stats[0].first()
<< "s (processor " << stats[0].second() << ')' << nl
<< indent << " max = " << stats[1].first()
<< "s (processor " << stats[1].second() << ')' << nl;
extractValues(extractedTimes, index, allTimes);
extractValues(extractedCounts, index, allCounts);
stats = calcStats(extractedTimes);
printTimingStats(Info(), "all-all ", stats);
if (detailLevel_ > 0) printTimingDetail(extractedTimes);
if (detailLevel_ > 1) printTimingDetail(extractedCounts);
}
// broadcast
{
const statData& stats = times[2];
double avg = stats[2].first()/Pstream::nProcs();
const int index = int(profilingPstream::BROADCAST);
Info<< indent << "other : avg = " << avg << 's' << nl
<< indent << " min = " << stats[0].first()
<< "s (processor " << stats[0].second() << ')' << nl
<< indent << " max = " << stats[1].first()
<< "s (processor " << stats[1].second() << ')' << nl;
extractValues(extractedTimes, index, allTimes);
extractValues(extractedCounts, index, allCounts);
stats = calcStats(extractedTimes);
printTimingStats(Info(), "broadcast ", stats);
if (detailLevel_ > 0) printTimingDetail(extractedTimes);
if (detailLevel_ > 1) printTimingDetail(extractedCounts);
}
// probe
{
const int index = int(profilingPstream::PROBE);
extractValues(extractedTimes, index, allTimes);
extractValues(extractedCounts, index, allCounts);
stats = calcStats(extractedTimes);
printTimingStats(Info(), "probe ", stats);
if (detailLevel_ > 0) printTimingDetail(extractedTimes);
if (detailLevel_ > 1) printTimingDetail(extractedCounts);
}
// Reduce/scatter times
{
// const int index = int(profilingPstream::REDUCE);
extractValues
(
extractedTimes,
allTimes,
[=](const double values[])
{
return
(
values[profilingPstream::REDUCE]
+ values[profilingPstream::GATHER]
+ values[profilingPstream::SCATTER]
);
}
);
extractValues
(
extractedCounts,
allCounts,
[=](const uint64_t values[])
{
return
(
values[profilingPstream::REDUCE]
+ values[profilingPstream::GATHER]
+ values[profilingPstream::SCATTER]
);
}
);
stats = calcStats(extractedTimes);
printTimingStats(Info(), "reduce ", stats);
if (detailLevel_ > 0) printTimingDetail(extractedTimes);
if (detailLevel_ > 1) printTimingDetail(extractedCounts);
}
// request
{
const int index = int(profilingPstream::REQUEST);
extractValues(extractedTimes, index, allTimes);
extractValues(extractedCounts, index, allCounts);
stats = calcStats(extractedTimes);
printTimingStats(Info(), "request ", stats);
if (detailLevel_ > 0) printTimingDetail(extractedTimes);
if (detailLevel_ > 1) printTimingDetail(extractedCounts);
}
// wait
{
const int index = int(profilingPstream::WAIT);
extractValues(extractedTimes, index, allTimes);
extractValues(extractedCounts, index, allCounts);
stats = calcStats(extractedTimes);
printTimingStats(Info(), "wait ", stats);
if (detailLevel_ > 0) printTimingDetail(extractedTimes);
if (detailLevel_ > 1) printTimingDetail(extractedCounts);
}
Info<< decrIndent;

View File

@ -5,7 +5,7 @@
\\ / A nd | www.openfoam.com
\\/ M anipulation |
-------------------------------------------------------------------------------
Copyright (C) 2019-2022 OpenCFD Ltd.
Copyright (C) 2019-2023 OpenCFD Ltd.
-------------------------------------------------------------------------------
License
This file is part of OpenFOAM.
@ -43,6 +43,7 @@ Usage
// Report stats on exit only (instead of every time step)
executeControl onEnd;
writeControl none;
detail 0;
}
\endverbatim
@ -60,10 +61,6 @@ SourceFiles
namespace Foam
{
// Forward Declarations
class Time;
namespace functionObjects
{
@ -75,7 +72,15 @@ class parProfiling
:
public functionObject
{
// Private Member Functions
// Private Data
//- The level of detail
// 0: summary, 1: per-proc times, 2: per-proc times/counts
int detailLevel_;
public:
// Generated Methods
//- No copy construct
parProfiling(const parProfiling&) = delete;
@ -84,8 +89,6 @@ class parProfiling
void operator=(const parProfiling&) = delete;
public:
//- Runtime type information
TypeName("parProfiling");
@ -116,7 +119,7 @@ public:
//- Do nothing
virtual bool write();
//- Report
//- Disables profilingPstream
virtual bool end();
};