Added Kokkos-like array datatype into RK4 and RHS in FixRXKokkos.

- Created an Array class that provides stride access for operator[] w/o needing Kokkos views. This was designed to avoid the performance issues encountered with Views and sub-views throughout the RHS and ODE solver functions.
2017-02-12 21:21:11 -05:00
parent 93d99ec8d0
commit 4ac7a5d1f2
2 changed files with 570 additions and 2 deletions
--- a/src/KOKKOS/fix_rx_kokkos.cpp
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@ -202,6 +202,373 @@ void FixRxKokkos<DeviceType>::rk4(const double t_stop, double *y, double *rwork,

 /* ---------------------------------------------------------------------- */

+template <typename DeviceType>
+  template <typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rk4(const double t_stop, double *y, double *rwork, UserDataType& userData) const
+{ // Fixed-step classical RK4: advances y in place over a total interval of t_stop using minSteps equal steps.
+  double *k1 = rwork;          // rwork is split into five consecutive nspecies-long segments,
+  double *k2 = k1 + nspecies;  // so it must provide at least 5*nspecies doubles.
+  double *k3 = k2 + nspecies;
+  double *k4 = k3 + nspecies;
+  double *yp = k4 + nspecies;  // trial state passed to k_rhs for stages 2-4
+  // The number of steps is taken directly from minSteps; no error control is done here.
+  const int numSteps = minSteps;
+  // NOTE(review): if minSteps <= 0 the loop below never runs and y is returned unchanged.
+  const double h = t_stop / double(numSteps);
+
+  // Run the requested steps with h.
+  for (int step = 0; step < numSteps; step++)
+  {
+    // k1 = f(y). k_rhs is always called with t = 0.0; the stage time is never advanced.
+    k_rhs(0.0,y,k1, userData);
+
+    // k2 = f(y + h/2 * k1)
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + 0.5*h*k1[ispecies];
+
+    k_rhs(0.0,yp,k2, userData);
+
+    // k3 = f(y + h/2 * k2)
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + 0.5*h*k2[ispecies];
+
+    k_rhs(0.0,yp,k3, userData);
+
+    // k4 = f(y + h * k3)
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + h*k3[ispecies];
+
+    k_rhs(0.0,yp,k4, userData);
+    // Combine the four slopes with weights 1/6, 1/3, 1/3, 1/6 and update y in place.
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      y[ispecies] += h*(k1[ispecies]/6.0 + k2[ispecies]/3.0 + k3[ispecies]/3.0 + k4[ispecies]/6.0);
+
+  } // end for (int step...
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+//     f1 = dt*f(t,x)
+//     f2 = dt*f(t+ c20*dt,x + c21*f1)
+//     f3 = dt*f(t+ c30*dt,x + c31*f1 + c32*f2)
+//     f4 = dt*f(t+ c40*dt,x + c41*f1 + c42*f2 + c43*f3)
+//     f5 = dt*f(t+dt,x + c51*f1 + c52*f2 + c53*f3 + c54*f4)
+//     f6 = dt*f(t+ c60*dt,x + c61*f1 + c62*f2 + c63*f3 + c64*f4 + c65*f5)
+//     (the c20..c60 time offsets are not used below: k_rhs is always called with t = 0.0)
+//     fifth-order runge-kutta integration
+//        x5 = x + b1*f1 + b3*f3 + b4*f4 + b5*f5 + b6*f6
+//     fourth-order runge-kutta integration
+//        x  = x + a1*f1 + a3*f3 + a4*f4 + a5*f5
+// k_rkf45_step: one trial step of size h; y_out receives the 4th-order result and rwk[0..neq) the |5th - 4th| difference.
+template <typename DeviceType>
+  template <typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rkf45_step (const int neq, const double h, double y[], double y_out[], double rwk[], UserDataType& userData) const
+{ // rwk must provide at least 6*neq doubles (f1..f6); y is only read here.
+   const double c21=0.25;
+   const double c31=0.09375;
+   const double c32=0.28125;
+   const double c41=0.87938097405553;
+   const double c42=-3.2771961766045;
+   const double c43=3.3208921256258;
+   const double c51=2.0324074074074;
+   const double c52=-8.0;
+   const double c53=7.1734892787524;
+   const double c54=-0.20589668615984;
+   const double c61=-0.2962962962963;
+   const double c62=2.0;
+   const double c63=-1.3816764132554;
+   const double c64=0.45297270955166;
+   const double c65=-0.275;
+   const double a1=0.11574074074074;
+   const double a3=0.54892787524366;
+   const double a4=0.5353313840156;
+   const double a5=-0.2;
+   const double b1=0.11851851851852;
+   const double b3=0.51898635477583;
+   const double b4=0.50613149034201;
+   const double b5=-0.18;
+   const double b6=0.036363636363636;
+   // cNM: stage coefficients; aN: weights of the 4th-order sum; bN: weights of the 5th-order sum.
+   // local dependent variables (6 total): stage increments f1..f6, each neq long
+   double* f1 = &rwk[    0];
+   double* f2 = &rwk[  neq];
+   double* f3 = &rwk[2*neq];
+   double* f4 = &rwk[3*neq];
+   double* f5 = &rwk[4*neq];
+   double* f6 = &rwk[5*neq];
+
+   // scratch for the intermediate solution.
+   //double* ytmp = &rwk[6*neq];
+   double* ytmp = y_out; // the output buffer doubles as stage scratch, so y_out must not alias y
+
+   // 1) f1 = h * f(y)
+   k_rhs (0.0, y, f1, userData);
+
+   for (int k = 0; k < neq; k++){
+      f1[k] *= h;
+      ytmp[k] = y[k] + c21 * f1[k];
+   }
+
+   // 2) f2 = h * f(y + c21*f1)
+   k_rhs(0.0, ytmp, f2, userData);
+
+   for (int k = 0; k < neq; k++){
+      f2[k] *= h;
+      ytmp[k] = y[k] + c31 * f1[k] + c32 * f2[k];
+   }
+
+   // 3) f3 = h * f(y + c31*f1 + c32*f2)
+   k_rhs(0.0, ytmp, f3, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f3[k] *= h;
+      ytmp[k] = y[k] + c41 * f1[k] + c42 * f2[k] + c43 * f3[k];
+   }
+
+   // 4) f4 = h * f(y + c41*f1 + c42*f2 + c43*f3)
+   k_rhs(0.0, ytmp, f4, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f4[k] *= h;
+      ytmp[k] = y[k] + c51 * f1[k] + c52 * f2[k] + c53 * f3[k] + c54 * f4[k];
+   }
+
+   // 5) f5 = h * f(y + c51*f1 + c52*f2 + c53*f3 + c54*f4)
+   k_rhs(0.0, ytmp, f5, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f5[k] *= h;
+      ytmp[k] = y[k] + c61*f1[k] + c62*f2[k] + c63*f3[k] + c64*f4[k] + c65*f5[k];
+   }
+
+   // 6) f6 = h * f(y + c61*f1 + c62*f2 + c63*f3 + c64*f4 + c65*f5), then form both solutions
+   k_rhs(0.0, ytmp, f6, userData);
+
+   for (int k = 0; k < neq; k++)
+   {
+      //const double f6 = h * ydot[k];
+      f6[k] *= h;
+
+      // 5th-order solution.
+      const double r5 = b1*f1[k] + b3*f3[k] + b4*f4[k] + b5*f5[k] + b6*f6[k];
+
+      // 4th-order solution.
+      const double r4 = a1*f1[k] + a3*f3[k] + a4*f4[k] + a5*f5[k];
+
+      // Truncation error: difference between 4th and 5th-order solutions.
+      rwk[k] = fabs(r5 - r4); // overwrites f1[k] (f1 starts at rwk[0]); f1[k] was already consumed above
+
+      // Update solution.
+    //y_out[k] = y[k] + r5; // Local extrapolation
+      y_out[k] = y[k] + r4; // the 4th-order increment is the one written out
+   }
+
+   return;
+}
+
+template <typename DeviceType>
+  template <typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rkf45_h0
+                    (const int neq, const double t, const double t_stop,
+                     const double hmin, const double hmax,
+                     double& h0, double y[], double rwk[], UserDataType& userData) const
+{
+   // Estimates an initial step size and stores it in h0, by iterating on a finite-difference
+   // estimate of y''. The first trial value is the geometric mean of hmin and hmax.
+   // rwk must provide at least 3*neq doubles (ydot, y1, ydot1). t_stop is not used in this body.
+   // Returns iter + 1, where iter counts the completed refinement passes of the loop below.
+   double hg = sqrt(hmin*hmax);
+   // (The early exit for crossed bounds is currently disabled.)
+   //if (hmax < hmin)
+   //{
+   //   h0 = hg;
+   //   return;
+   //}
+
+   // Start iteration to find solution to ... {WRMS norm of (h0^2 y'' / 2)} = 1
+
+   double *ydot  = rwk;
+   double *y1    = ydot + neq;
+   double *ydot1 = y1 + neq;
+
+   const int max_iters = 10;
+   bool hnew_is_ok = false;
+   double hnew = hg;
+   int iter = 0;
+
+   // compute ydot at t=t0
+   k_rhs (t, y, ydot, userData);
+
+   while(1)
+   {
+      // Estimate y'' with finite-difference ...
+      // y1 is an explicit Euler step of size hg away from y.
+      for (int k = 0; k < neq; k++)
+         y1[k] = y[k] + hg * ydot[k];
+
+      // compute y' at t1
+      k_rhs (t + hg, y1, ydot1, userData);
+
+      // Compute WRMS norm of y''
+      double yddnrm = 0.0;
+      for (int k = 0; k < neq; k++){
+         double ydd = (ydot1[k] - ydot[k]) / hg;
+         double wterr = ydd / (relTol * fabs( y[k] ) + absTol); // weight: relTol*|y| + absTol
+         yddnrm += wterr * wterr;
+      }
+
+      yddnrm = sqrt( yddnrm / double(neq) );
+
+      //std::cout << "iter " << _iter << " hg " << hg << " y'' " << yddnrm << std::endl;
+      //std::cout << "ydot " << ydot[neq-1] << std::endl;
+
+      // Accept hg if the previous pass flagged it as OK, or stop after max_iters passes (reported on stderr).
+      if (hnew_is_ok || iter == max_iters){
+         hnew = hg;
+         if (iter == max_iters)
+            fprintf(stderr, "ERROR_HIN_MAX_ITERS\n");
+         break;
+      }
+
+      // Get the new value of h: sqrt(2/|y''|) when yddnrm*hmax^2 > 2, otherwise the geometric mean of hg and hmax.
+      hnew = (yddnrm*hmax*hmax > 2.0) ? sqrt(2.0 / yddnrm) : sqrt(hg * hmax);
+
+      // test the stopping conditions.
+      double hrat = hnew / hg;
+
+      // Accept this value ... the bias factor should bring it within range.
+      if ( (hrat > 0.5) && (hrat < 2.0) )
+         hnew_is_ok = true;
+
+      // If y'' is still bad after a few iterations, just accept h and give up.
+      if ( (iter > 1) && hrat > 2.0 ) {
+         hnew = hg; // keep the current guess instead of the larger proposal
+         hnew_is_ok = true;
+      }
+
+      //printf("iter=%d, yddnrw=%e, hnew=%e, hmin=%e, hmax=%e\n", iter, yddnrm, hnew, hmin, hmax);
+
+      hg = hnew;
+      iter ++;
+   }
+
+   // bias the estimate by 0.5, then clamp it: lower bound hmin first, upper bound hmax last
+   h0 = hnew * 0.5;
+   h0 = fmax(h0, hmin);
+   h0 = fmin(h0, hmax);
+   //printf("h0=%e, hmin=%e, hmax=%e\n", h0, hmin, hmax);
+
+   return (iter + 1);
+}
+
+template <typename DeviceType>
+  template <typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rkf45(const int neq, const double t_stop, double *y, double *rwork, UserDataType& userData, CounterType& counter) const
+{ // Adaptive RKF45 driver: integrates y in place from t = 0 to t_stop; rwork needs at least 7*neq doubles.
+  // Rounding coefficient.
+  const double uround = DBL_EPSILON;
+
+  // Adaption limit (shrink or grow)
+  const double adaption_limit = 4.0;
+
+  // Safety factor on the adaption. very specific but not necessary .. 0.9 is common.
+  const double hsafe = 0.840896415;
+
+  // Time rounding factor.
+  const double tround = t_stop * uround;
+
+  // Counters for diagnostics.
+  int nst = 0; // # of steps (accepted)
+  int nit = 0; // # of iterations total
+  int nfe = 0; // # of RHS evaluations
+
+  // Min/Max step-size limits.
+  const double h_min = 100.0 * tround;
+  const double h_max = (minSteps > 0) ? t_stop / double(minSteps) : t_stop;
+
+  // Set the initial step-size. 0 forces an internal estimate ... stable Euler step size.
+  double h = (minSteps > 0) ? t_stop / double(minSteps) : 0.0;
+
+  double t = 0.0;
+  // If no usable initial step was given, estimate one; the helper's return value seeds nfe.
+  if (h < h_min){
+    //fprintf(stderr,"hin not implemented yet\n");
+    //exit(-1);
+    nfe = k_rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, rwork, userData);
+  }
+
+  //printf("t= %e t_stop= %e h= %e\n", t, t_stop, h);
+
+  // Integrate until we reach the end time.
+  while (fabs(t - t_stop) > tround){
+    double *yout = rwork;      // trial solution (neq doubles)
+    double *eout = yout + neq; // stage workspace (6*neq doubles); holds the error estimate on return
+
+    // Take a trial step.
+    k_rkf45_step (neq, h, y, yout, eout, userData);
+
+    // Estimate the solution error.
+      // ... weighted 2-norm of the error.
+      double err2 = 0.0;
+      for (int k = 0; k < neq; k++){
+        const double wterr = eout[k] / (relTol * fabs( y[k] ) + absTol);
+        err2 += wterr * wterr;
+      }
+    // RMS over the neq components summed above (previously divided by nspecies); floored at uround.
+    double err = fmax( uround, sqrt( err2 / double(neq) ));
+
+    // Accept the solution? A step already at h_min is accepted regardless of its error.
+    if (err <= 1.0 || h <= h_min){
+      t += h;
+      nst++;
+
+      for (int k = 0; k < neq; k++)
+        y[k] = yout[k];
+    }
+
+    // Adjust h for the next step: hfac = hsafe * err^(-1/4).
+    double hfac = hsafe * sqrt( sqrt( 1.0 / err ) );
+
+    // Limit the adaption.
+    hfac = fmax( hfac, 1.0 / adaption_limit );
+    hfac = fmin( hfac,       adaption_limit );
+
+    // Apply the adaption factor...
+    h *= hfac;
+
+    // Limit h.
+    h = fmin( h, h_max );
+    h = fmax( h, h_min );
+
+    // Stretch h if we're within 5% ... and we didn't just fail.
+    if (err <= 1.0 && (t + 1.05*h) > t_stop)
+      h = t_stop - t;
+
+    // And don't overshoot the end.
+    if (t + h > t_stop)
+      h = t_stop - t;
+
+    nit++;
+    nfe += 6; // six k_rhs calls per trial step
+
+    if (maxIters && nit > maxIters){
+      //fprintf(stderr,"atom[%d] took too many iterations in rkf45 %d %e %e\n", id, nit, t, t_stop);
+      counter.nFails ++;
+      break; // y keeps the last accepted state; t has not reached t_stop
+      // We should set an error here so that the solution is not used!
+    }
+
+  } // end while
+  // Fold this call's diagnostics into the caller-supplied counter.
+  counter.nSteps += nst;
+  counter.nIters += nit;
+  counter.nFuncs += nfe;
+
+  //printf("id= %d nst= %d nit= %d\n", id, nst, nit);
+}
+/* ---------------------------------------------------------------------- */
+
 //     f1 = dt*f(t,x)
 //     f2 = dt*f(t+ c20*dt,x + c21*f1)
 //     f3 = dt*f(t+ c30*dt,x + c31*f1 + c32*f2)
@ -664,6 +1031,152 @@ int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double *y, double *dydt,

 /* ---------------------------------------------------------------------- */

+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{ // RHS entry point: forwards all arguments to the sparse or dense kernel and returns that kernel's result.
+  //StridedArrayType<double,1> _y( const_cast<double *>( y ) ), _dydt( dydt );
+  // Selection is made at run time from the useSparseKinetics member.
+  // Use the sparse format instead.
+  if (useSparseKinetics)
+    return this->k_rhs_sparse( t, y, dydt, userData);
+  else
+    return this->k_rhs_dense ( t, y, dydt, userData);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs_dense(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{ // Dense-stoichiometry RHS: fills dydt[0..nspecies) from y; t is not used and the return value is always 0.
+  #define rxnRateLaw (userData.rxnRateLaw)
+  #define kFor       (userData.kFor      )
+  // The two macros above are local shorthands; they are removed again by the #undef lines before the return.
+  //const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+  //const int nspecies = atom->nspecies_dpd;
+
+  for(int ispecies=0; ispecies<nspecies; ispecies++)
+    dydt[ispecies] = 0.0;
+
+  // Construct the reaction rate laws: rate[j] = kFor[j] * prod_i (y[i]/VDPD)^stoichReactants(j,i)
+  for(int jrxn=0; jrxn<nreactions; jrxn++){
+    double rxnRateLawForward = kFor[jrxn];
+
+    for(int ispecies=0; ispecies<nspecies; ispecies++){
+      const double concentration = y[ispecies]/VDPD;
+      rxnRateLawForward *= pow( concentration, d_kineticsData.stoichReactants(jrxn,ispecies) );
+      //rxnRateLawForward *= pow(concentration,stoichReactants[jrxn][ispecies]);
+    }
+    rxnRateLaw[jrxn] = rxnRateLawForward; // stored in userData scratch for the accumulation below
+  }
+
+  // Construct the reaction rates for each species: dydt[i] = sum_j stoich(j,i) * VDPD * rate[j]
+  for(int ispecies=0; ispecies<nspecies; ispecies++)
+    for(int jrxn=0; jrxn<nreactions; jrxn++)
+    {
+      dydt[ispecies] += d_kineticsData.stoich(jrxn,ispecies) *VDPD*rxnRateLaw[jrxn];
+      //dydt[ispecies] += stoich[jrxn][ispecies]*VDPD*rxnRateLaw[jrxn];
+    }
+
+  #undef rxnRateLaw
+  #undef kFor
+
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs_sparse(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{ // Sparse-kinetics RHS: fills dydt[0..nspecies) from y; t is not used and the return value is always 0.
+   #define kFor         (userData.kFor)
+   #define kRev         (NULL) // defined but not referenced in this function
+   #define rxnRateLaw   (userData.rxnRateLaw)
+   #define conc         (dydt) // concentrations are staged in dydt itself (scratch reuse)
+   #define maxReactants (this->sparseKinetics_maxReactants)
+   #define maxSpecies   (this->sparseKinetics_maxSpecies)
+   #define nuk          (this->d_kineticsData.nuk)
+   #define nu           (this->d_kineticsData.nu)
+   #define inu          (this->d_kineticsData.inu)
+   #define isIntegral(idx) ( SparseKinetics_enableIntegralReactions \
+                             && this->d_kineticsData.isIntegral(idx) )
+   // Stage 1: convert y to concentrations (stored in dydt through the conc macro).
+   for (int k = 0; k < nspecies; ++k)
+      conc[k] = y[k] / VDPD;
+
+   // Construct the reaction rate laws
+   // NOTE(review): slot 0 is used without an invalid-index check, i.e. every reaction is assumed to have at least one reactant.
+   for (int i = 0; i < nreactions; ++i)
+   {
+      double rxnRateLawForward;
+      if (isIntegral(i)){ // integer exponents: powint with inu
+         rxnRateLawForward = kFor[i] * powint( conc[ nuk(i,0) ], inu(i,0) );
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk(i,kk);
+            if (k == SparseKinetics_invalidIndex) break;
+            //if (k != SparseKinetics_invalidIndex)
+               rxnRateLawForward *= powint( conc[k], inu(i,kk) );
+         }
+      } else { // general exponents: pow with nu
+         rxnRateLawForward = kFor[i] * pow( conc[ nuk(i,0) ], nu(i,0) );
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk(i,kk);
+            if (k == SparseKinetics_invalidIndex) break;
+            //if (k != SparseKinetics_invalidIndex)
+               rxnRateLawForward *= pow( conc[k], nu(i,kk) );
+         }
+      }
+
+      rxnRateLaw[i] = rxnRateLawForward;
+   }
+
+   // Construct the reaction rates for each species from the
+   // Stoichiometric matrix and ROP vector. dydt held the concentrations until here, so it is reset first.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] = 0.0;
+
+   for (int i = 0; i < nreactions; ++i){
+      // Reactants ... (slots 0 .. maxReactants-1; a slot holding the invalid index ends the list)
+      dydt[ nuk(i,0) ] -= nu(i,0) * rxnRateLaw[i];
+      for (int kk = 1; kk < maxReactants; ++kk){
+         const int k = nuk(i,kk);
+         if (k == SparseKinetics_invalidIndex) break;
+         //if (k != SparseKinetics_invalidIndex)
+            dydt[k] -= nu(i,kk) * rxnRateLaw[i];
+      }
+
+      // Products ... (slots maxReactants .. maxSpecies-1; the first product slot is used unchecked)
+      dydt[ nuk(i,maxReactants) ] += nu(i,maxReactants) * rxnRateLaw[i];
+      for (int kk = maxReactants+1; kk < maxSpecies; ++kk){
+         const int k = nuk(i,kk);
+         if (k == SparseKinetics_invalidIndex) break;
+         //if (k != SparseKinetics_invalidIndex)
+            dydt[k] += nu(i,kk) * rxnRateLaw[i];
+      }
+   }
+
+   // Add in the volume factor to convert to the proper units.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] *= VDPD;
+
+   #undef kFor
+   #undef kRev
+   #undef rxnRateLaw
+   #undef conc
+   #undef maxReactants
+   #undef maxSpecies
+   #undef nuk
+   #undef nu
+   #undef inu
+   #undef isIntegral
+   //#undef invalidIndex
+
+   return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
 /*template <typename DeviceType>
  template <typename SolverType>
    KOKKOS_INLINE_FUNCTION
@ -907,6 +1420,10 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
        userData.kFor = new double[nreactions];
        userData.rxnRateLaw = new double[nreactions];

+        UserRHSDataKokkos<1> userDataKokkos;
+        userDataKokkos.kFor.m_data = userData.kFor;
+        userDataKokkos.rxnRateLaw.m_data = userData.rxnRateLaw;
+
        CounterType counter_i;

        const double theta = (localTempFlag) ? d_dpdThetaLocal(i) : d_dpdTheta(i);
@ -935,7 +1452,8 @@ void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreF
        // Solver the ODE system.
        if (odeIntegrationFlag == ODE_LAMMPS_RK4)
        {
-          rk4(t_stop, y, rwork, &userData);
+          //rk4(t_stop, y, rwork, &userData);
+          k_rk4(t_stop, y, rwork, userDataKokkos);
        }
        else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
        {