Added ODE diagnostics to FixRxKokkos using Kokkos managed data.

- Added the diagnostics performance analysis routine to FixRxKokkos
  using Kokkos views.
TODO:
  - Switch to using Kokkos data for the per-iteration scratch data.
    How to allocate only enouch for each work-unit and not all
    iterations? Can the shared-memory scratch memory work for this,
    even for large sizes?
This commit is contained in:
Christopher Stone
2017-02-09 22:38:58 -05:00
parent 4e8351d9c8
commit 93d99ec8d0
2 changed files with 223 additions and 21 deletions

View File

@ -879,11 +879,22 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
// Average DPD volume. Used in the RHS function.
this->VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
/*if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
{
memory->create( diagnosticCounterPerODE[StepSum], nlocal, "FixRX::diagnosticCounterPerODE");
memory->create( diagnosticCounterPerODE[FuncSum], nlocal, "FixRX::diagnosticCounterPerODE");
}*/
memory->create_kokkos (k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps, nlocal, "FixRxKokkos::diagnosticCounterPerODEnSteps");
memory->create_kokkos (k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs, nlocal, "FixRxKokkos::diagnosticCounterPerODEnFuncs");
d_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.d_view;
d_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.d_view;
Kokkos::parallel_for ( nlocal,
LAMMPS_LAMBDA(const int i)
{
d_diagnosticCounterPerODEnSteps(i) = 0;
d_diagnosticCounterPerODEnFuncs(i) = 0;
}
);
}
Kokkos::parallel_reduce( nlocal, LAMMPS_LAMBDA(int i, CounterType &counter)
{
@ -930,12 +941,11 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
{
rkf45(nspecies, t_stop, y, rwork, &userData, counter_i);
//if (diagnosticFrequency == 1 && diagnosticCounterPerODE[StepSum] != NULL)
//if (diagnosticCounterPerODE[StepSum] != NULL)
//{
// diagnosticCounterPerODE[StepSum][i] = counter_i.nSteps;
// diagnosticCounterPerODE[FuncSum][i] = counter_i.nFuncs;
//}
if (diagnosticFrequency == 1)
{
d_diagnosticCounterPerODEnSteps(i) = counter_i.nSteps;
d_diagnosticCounterPerODEnFuncs(i) = counter_i.nFuncs;
}
}
// Store the solution back in dvector.
@ -975,10 +985,7 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
atomKK->modified ( Host, DVECTOR_MASK );
if (localTempFlag)
{
//delete [] dpdThetaLocal;
memory->destroy_kokkos(k_dpdThetaLocal, dpdThetaLocal);
}
TimerType timer_stop = getTimeStamp();
@ -997,12 +1004,12 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
error->warning(FLERR, sbuf);
}
/*
// Compute and report ODE diagnostics, if requested.
if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency != 0){
if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency != 0)
{
// Update the counters.
diagnosticCounter[StepSum] += nSteps;
diagnosticCounter[FuncSum] += nFuncs;
diagnosticCounter[StepSum] += TotalCounters.nSteps;
diagnosticCounter[FuncSum] += TotalCounters.nFuncs;
diagnosticCounter[TimeSum] += time_ODE;
diagnosticCounter[AtomSum] += nlocal;
diagnosticCounter[numDiagnosticCounters-1] ++;
@ -1011,11 +1018,193 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
((update->ntimestep - update->firststep) % diagnosticFrequency) == 0) ||
(diagnosticFrequency < 0 && update->ntimestep == update->laststep) )
this->odeDiagnostics();
}
}
for (int i = 0; i < numDiagnosticCounters; ++i)
if (diagnosticCounterPerODE[i])
memory->destroy( diagnosticCounterPerODE[i] );
} */
/* ---------------------------------------------------------------------- */
template <typename DeviceType>
void FixRxKokkos<DeviceType>::odeDiagnostics(void)
{
TimerType timer_start = getTimeStamp();
// Compute:
// 1) Average # of ODE integrator steps and RHS evaluations per atom globally.
// 2) RMS # of ...
// 3) Average # of ODE steps and RHS evaluations per MPI task.
// 4) RMS # of ODE steps and RHS evaluations per MPI task.
// 5) MAX # of ODE steps and RHS evaluations per MPI task.
//
// ... 1,2 are for ODE control diagnostics.
// ... 3-5 are for load balancing diagnostics.
//
// To do this, we'll need to
// a) Allreduce (sum) the sum of nSteps / nFuncs. Dividing by atom->natoms
// gives the avg # of steps/funcs per atom globally.
// b) Reduce (sum) to root the sum of squares of the differences.
// i) Sum_i (steps_i - avg_steps_global)^2
// ii) Sum_i (funcs_i - avg_funcs_global)^2
// iii) (avg_steps_local - avg_steps_global)^2
// iv) (avg_funcs_local - avg_funcs_global)^2
const int numCounters = numDiagnosticCounters-1;
// # of time-steps for averaging.
const int nTimes = this->diagnosticCounter[numDiagnosticCounters-1];
// # of ODE's per time-step (on average).
//const int nODEs = this->diagnosticCounter[AtomSum] / nTimes;
// Sum up the sums from each task.
double sums[numCounters];
double my_vals[numCounters];
double max_per_proc[numCounters];
double min_per_proc[numCounters];
// Compute counters per dpd time-step.
for (int i = 0; i < numCounters; ++i){
my_vals[i] = this->diagnosticCounter[i] / nTimes;
//printf("my sum[%d] = %f %d\n", i, my_vals[i], comm->me);
}
MPI_Allreduce (my_vals, sums, numCounters, MPI_DOUBLE, MPI_SUM, world);
MPI_Reduce (my_vals, max_per_proc, numCounters, MPI_DOUBLE, MPI_MAX, 0, world);
MPI_Reduce (my_vals, min_per_proc, numCounters, MPI_DOUBLE, MPI_MIN, 0, world);
const double nODEs = sums[numCounters-1];
double avg_per_atom[numCounters], avg_per_proc[numCounters];
// Averages per-ODE and per-proc per time-step.
for (int i = 0; i < numCounters; ++i){
avg_per_atom[i] = sums[i] / nODEs;
avg_per_proc[i] = sums[i] / comm->nprocs;
}
// Sum up the differences from each task.
double sum_sq[2*numCounters];
double my_sum_sq[2*numCounters];
for (int i = 0; i < numCounters; ++i){
double diff_i = my_vals[i] - avg_per_proc[i];
my_sum_sq[i] = diff_i * diff_i;
}
double max_per_ODE[numCounters], min_per_ODE[numCounters];
// Process the per-ODE RMS of the # of steps/funcs
if (diagnosticFrequency == 1)
{
h_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.h_view;
h_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.h_view;
Kokkos::deep_copy( h_diagnosticCounterPerODEnSteps, d_diagnosticCounterPerODEnSteps );
Kokkos::deep_copy( h_diagnosticCounterPerODEnFuncs, d_diagnosticCounterPerODEnFuncs );
double my_max[numCounters], my_min[numCounters];
const int nlocal = atom->nlocal;
HAT::t_int_1d h_mask = atomKK->k_mask.h_view;
for (int i = 0; i < numCounters; ++i)
{
my_sum_sq[i+numCounters] = 0;
my_max[i] = 0;
my_min[i] = DBL_MAX;
}
for (int j = 0; j < nlocal; ++j)
if (h_mask(j) & groupbit)
{
int nSteps = h_diagnosticCounterPerODEnSteps(j);
double diff_nSteps = double( nSteps ) - avg_per_atom[StepSum];
my_sum_sq[StepSum+numCounters] += diff_nSteps*diff_nSteps;
my_max[StepSum] = std::max( my_max[StepSum], (double)nSteps );
my_min[StepSum] = std::min( my_min[StepSum], (double)nSteps );
int nFuncs = h_diagnosticCounterPerODEnFuncs(j);
double diff_nFuncs = double( nFuncs ) - avg_per_atom[FuncSum];
my_sum_sq[FuncSum+numCounters] += diff_nFuncs*diff_nFuncs;
my_max[FuncSum] = std::max( my_max[FuncSum], (double)nFuncs );
my_min[FuncSum] = std::min( my_min[FuncSum], (double)nFuncs );
}
memory->destroy_kokkos( k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps );
memory->destroy_kokkos( k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs );
MPI_Reduce (my_sum_sq, sum_sq, 2*numCounters, MPI_DOUBLE, MPI_SUM, 0, world);
MPI_Reduce (my_max, max_per_ODE, numCounters, MPI_DOUBLE, MPI_MAX, 0, world);
MPI_Reduce (my_min, min_per_ODE, numCounters, MPI_DOUBLE, MPI_MIN, 0, world);
}
else
MPI_Reduce (my_sum_sq, sum_sq, numCounters, MPI_DOUBLE, MPI_SUM, 0, world);
TimerType timer_stop = getTimeStamp();
double time_local = getElapsedTime( timer_start, timer_stop );
if (comm->me == 0){
char smesg[128];
#define print_mesg(smesg) {\
if (screen) fprintf(screen,"%s\n", smesg); \
if (logfile) fprintf(logfile,"%s\n", smesg); }
sprintf(smesg, "FixRX::ODE Diagnostics: # of iters |# of rhs evals| run-time (sec) | # atoms");
print_mesg(smesg);
sprintf(smesg, " AVG per ODE : %-12.5g | %-12.5g | %-12.5g", avg_per_atom[0], avg_per_atom[1], avg_per_atom[2]);
print_mesg(smesg);
// only valid for single time-step!
if (diagnosticFrequency == 1){
double rms_per_ODE[numCounters];
for (int i = 0; i < numCounters; ++i)
rms_per_ODE[i] = sqrt( sum_sq[i+numCounters] / nODEs );
sprintf(smesg, " RMS per ODE : %-12.5g | %-12.5g ", rms_per_ODE[0], rms_per_ODE[1]);
print_mesg(smesg);
sprintf(smesg, " MAX per ODE : %-12.5g | %-12.5g ", max_per_ODE[0], max_per_ODE[1]);
print_mesg(smesg);
sprintf(smesg, " MIN per ODE : %-12.5g | %-12.5g ", min_per_ODE[0], min_per_ODE[1]);
print_mesg(smesg);
}
sprintf(smesg, " AVG per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", avg_per_proc[StepSum], avg_per_proc[FuncSum], avg_per_proc[TimeSum], avg_per_proc[AtomSum]);
print_mesg(smesg);
if (comm->nprocs > 1){
double rms_per_proc[numCounters];
for (int i = 0; i < numCounters; ++i)
rms_per_proc[i] = sqrt( sum_sq[i] / comm->nprocs );
sprintf(smesg, " RMS per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", rms_per_proc[0], rms_per_proc[1], rms_per_proc[2], rms_per_proc[AtomSum]);
print_mesg(smesg);
sprintf(smesg, " MAX per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", max_per_proc[0], max_per_proc[1], max_per_proc[2], max_per_proc[AtomSum]);
print_mesg(smesg);
sprintf(smesg, " MIN per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", min_per_proc[0], min_per_proc[1], min_per_proc[2], min_per_proc[AtomSum]);
print_mesg(smesg);
}
sprintf(smesg, " AVG'd over %d time-steps", nTimes);
print_mesg(smesg);
sprintf(smesg, " AVG'ing took %g sec", time_local);
print_mesg(smesg);
#undef print_mesg
}
// Reset the counters.
for (int i = 0; i < numDiagnosticCounters; ++i)
diagnosticCounter[i] = 0;
return;
}
/* ---------------------------------------------------------------------- */