complete the implementation of respa/omp. tested ok.

2013-02-23 18:27:33 +01:00
parent ba8c0bb5bc
commit d2fef46fb3
4 changed files with 94 additions and 35 deletions
--- a/src/USER-OMP/fix_omp.cpp
+++ b/src/USER-OMP/fix_omp.cpp
@ -68,7 +68,7 @@ static int get_tid()
 FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg) 
  :  Fix(lmp, narg, arg),
     thr(NULL), last_omp_style(NULL), last_pair_hybrid(NULL),
-     _nthr(-1), _neighbor(true), _mixed(false)
+     _nthr(-1), _neighbor(true), _mixed(false), _reduced(true)
 {
  if ((narg < 4) || (narg > 7)) error->all(FLERR,"Illegal package omp command");
  if (strcmp(arg[1],"all") != 0) error->all(FLERR,"fix OMP has to operate on group 'all'");
@ -87,8 +87,10 @@ FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg)
  if (nthreads < 1)
    error->all(FLERR,"Illegal number of OpenMP threads requested");

+  int reset_thr = 0;
  if (nthreads != comm->nthreads) {
 #if defined(_OPENMP)
+    reset_thr = 1;
    omp_set_num_threads(nthreads);
 #endif
    comm->nthreads = nthreads;
@ -110,25 +112,21 @@ FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg)

  // print summary of settings
  if (comm->me == 0) {
-    const char * const nmode = _neighbor ? "OpenMP capable" : "serial";
+    const char * const nmode = _neighbor ? "multi-threaded" : "serial";
    const char * const kmode = _mixed ? "mixed" : "double";

    if (screen) {
-      fprintf(screen,"  reset %d OpenMP thread(s) per MPI task\n", nthreads);
-      fprintf(screen,"  using %s neighbor list subroutines\n", nmode);
-      if (_mixed)
-        fputs("  using mixed precision OpenMP force kernels where available\n", screen);
-      else
-        fputs("  using double precision OpenMP force kernels\n", screen);
+      if (reset_thr)
+	fprintf(screen,"set %d OpenMP thread(s) per MPI task\n", nthreads);
+      fprintf(screen,"using %s neighbor list subroutines\n", nmode);
+      fprintf(screen,"prefer %s precision OpenMP force kernels\n", kmode);
    }
    
    if (logfile) {
-      fprintf(logfile,"  reset %d OpenMP thread(s) per MPI task\n", nthreads);
-      fprintf(logfile,"  using %s neighbor list subroutines\n", nmode);
-      if (_mixed)
-        fputs("  using mixed precision OpenMP force kernels where available\n", logfile);
-      else
-        fputs("  using double precision OpenMP force kernels\n", logfile);
+      if (reset_thr)
+	fprintf(logfile,"set %d OpenMP thread(s) per MPI task\n", nthreads);
+      fprintf(logfile,"using %s neighbor list subroutines\n", nmode);
+      fprintf(logfile,"prefer %s precision OpenMP force kernels\n", kmode);
    }
  }

@ -137,7 +135,6 @@ FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg)
  // encourage the OS to use storage that is "close" to each thread's CPU.
  thr = new ThrData *[nthreads];
  _nthr = nthreads;
-  _clearforce = true;
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
@ -283,7 +280,6 @@ void FixOMP::init()
        fprintf(logfile,"Last active /omp style is %s_style %s\n",
                last_force_name, last_omp_name);
    } else {
-      _clearforce = false;
      if (screen)
        fprintf(screen,"No /omp style for force computation currently active\n");
      if (logfile)
@ -323,18 +319,16 @@ void FixOMP::pre_force(int)
  double *de = atom->de;
  double *drho = atom->drho;

-  if (_clearforce) {
 #if defined(_OPENMP)
 #pragma omp parallel default(none) shared(f,torque,erforce,de,drho)
 #endif
-    {
-      const int tid = get_tid();
-      thr[tid]->check_tid(tid);
-      thr[tid]->init_force(nall,f,torque,erforce,de,drho);
-    }
-  } else {
-    thr[0]->init_force(nall,f,torque,erforce,de,drho);
-  }
+  {
+    const int tid = get_tid();
+    thr[tid]->check_tid(tid);
+    thr[tid]->init_force(nall,f,torque,erforce,de,drho);
+  } // end of omp parallel region
+
+  _reduced = false;
 }

 /* ---------------------------------------------------------------------- */
--- a/src/USER-OMP/fix_omp.h
+++ b/src/USER-OMP/fix_omp.h
@ -29,6 +29,7 @@ class ThrData;

 class FixOMP : public Fix {
  friend class ThrOMP;
+  friend class RespaOMP;

 public:
  FixOMP(class LAMMPS *, int, char **);
@ -51,6 +52,8 @@ class FixOMP : public Fix {
                           // to do the general force reduction
  void *last_pair_hybrid;  // pointer to the pair style that needs
                           // to call virial_fdot_compute()
+  // signal that an /omp style did the force reduction. needed by respa/omp
+  void did_reduce() { _reduced = true;  }

 public:
  ThrData *get_thr(int tid) { return thr[tid];  }
@ -58,12 +61,13 @@ class FixOMP : public Fix {

  bool get_neighbor() const { return _neighbor; }
  bool get_mixed()    const { return _mixed;    }
+  bool get_reduced()  const { return _reduced;  }

 private:
-  int  _nthr;     // number of currently active ThrData object
-  bool _neighbor; // en/disable threads for neighbor list construction
-  bool _clearforce; // whether to clear per thread data objects
-  bool _mixed;     // whether to use a mixed precision compute kernel
+  int  _nthr;       // number of currently active ThrData objects
+  bool _neighbor;   // en/disable threads for neighbor list construction
+  bool _mixed;      // whether to prefer mixed precision compute kernels
+  bool _reduced;    // whether forces have been reduced for this step

  void set_neighbor_omp();
 };
--- a/src/USER-OMP/respa_omp.cpp
+++ b/src/USER-OMP/respa_omp.cpp
@ -38,6 +38,10 @@
 #include "memory.h"
 #include "error.h"

+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
 using namespace LAMMPS_NS;

 /* ---------------------------------------------------------------------- */
@ -55,7 +59,7 @@ void RespaOMP::init()
 {
  Respa::init();

-  if (lmp->atom->torque)
+  if (atom->torque)
    error->all(FLERR,"Extended particles are not supported by respa/omp\n");
  
  // set newton flag for each level
@ -137,6 +141,25 @@ void RespaOMP::setup()
      force->kspace->setup();
      if (kspace_compute_flag) force->kspace->compute(eflag,vflag);
    }
+
+    // reduce forces from per-thread arrays, if needed
+    if (!fix->get_reduced()) {
+      const int nall = atom->nlocal + atom->nghost;
+      const int nthreads = comm->nthreads;
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+      {
+#if defined(_OPENMP)
+	int tid = omp_get_thread_num();
+#else
+	int tid = 0;
+#endif
+	data_reduce_thr(atom->f[0], nall, nthreads, 3, tid);
+      }
+      fix->did_reduce();
+    }
+      
    if (newton[ilevel]) comm->reverse_comm();
    copy_f_flevel(ilevel);
  }
@ -205,6 +228,25 @@ void RespaOMP::setup_minimal(int flag)
      force->kspace->setup();
      if (kspace_compute_flag) force->kspace->compute(eflag,vflag);
    }
+
+    // reduce forces from per-thread arrays, if needed
+    if (!fix->get_reduced()) {
+      const int nall = atom->nlocal + atom->nghost;
+      const int nthreads = comm->nthreads;
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+      {
+#if defined(_OPENMP)
+	int tid = omp_get_thread_num();
+#else
+	int tid = 0;
+#endif
+	data_reduce_thr(atom->f[0], nall, nthreads, 3, tid);
+      }
+      fix->did_reduce();
+    }
+
    if (newton[ilevel]) comm->reverse_comm();
    copy_f_flevel(ilevel);
  }
@ -315,6 +357,24 @@ void RespaOMP::recurse(int ilevel)
      timer->stamp(Timer::KSPACE);
    }

+    // reduce forces from per-thread arrays, if needed
+    if (!fix->get_reduced()) {
+      const int nall = atom->nlocal + atom->nghost;
+      const int nthreads = comm->nthreads;
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+      {
+#if defined(_OPENMP)
+	int tid = omp_get_thread_num();
+#else
+	int tid = 0;
+#endif
+	data_reduce_thr(atom->f[0], nall, nthreads, 3, tid);
+      }
+      fix->did_reduce();
+    }
+
    if (newton[ilevel]) {
      comm->reverse_comm();
      timer->stamp(Timer::COMM);
--- a/src/USER-OMP/thr_omp.cpp
+++ b/src/USER-OMP/thr_omp.cpp
@ -183,6 +183,7 @@ void ThrOMP::reduce_thr(void *style, const int eflag, const int vflag,
          // pair_style hybrid will compute fdotr for us
          // but we first need to reduce the forces
          data_reduce_thr(&(f[0][0]), nall, nthreads, 3, tid);
+	  fix->did_reduce();
          need_force_reduce = 0;
        }
      }
@ -380,13 +381,11 @@ void ThrOMP::reduce_thr(void *style, const int eflag, const int vflag,
    break;

  case THR_KSPACE:
-    // nothing to do. XXX need to add support for per-atom info
+    // nothing to do. XXX may need to add support for per-atom info
    break;

  case THR_INTGR:
-    // when running and /omp enabled integrator class.
-    // the reduction is run explicitly by the integrator
-    need_force_reduce = 0;
+    // nothing to do
    break;

  default:
@ -395,8 +394,10 @@ void ThrOMP::reduce_thr(void *style, const int eflag, const int vflag,
  }

  if (style == fix->last_omp_style) {
-    if (need_force_reduce)
+    if (need_force_reduce) {
      data_reduce_thr(&(f[0][0]), nall, nthreads, 3, tid);
+      fix->did_reduce();
+    }

    if (lmp->atom->torque)
      data_reduce_thr(&(lmp->atom->torque[0][0]), nall, nthreads, 3, tid);