Misc Improvements to GPU Package

- Optimizations for molecular systems
- Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communications for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@@ -28,9 +28,9 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),

 template <class numtyp, class acctyp>
 int AnswerT::bytes_per_atom() const {
-  int bytes=11*sizeof(acctyp);
+  int bytes=10*sizeof(acctyp);
  if (_rot)
-    bytes+=4*sizeof(acctyp);
+    bytes+=3*sizeof(acctyp);
  if (_charge)
    bytes+=sizeof(acctyp);
  return bytes;
@@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) {

  bool success=true;

-  _ans_fields=4;
+  _ans_fields=3;
  if (_rot)
-    _ans_fields+=4;
+    _ans_fields+=3;

  // ---------------------------  Device allocations
  success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
@@ -134,11 +134,11 @@ void AnswerT::clear() {

 template <class numtyp, class acctyp>
 double AnswerT::host_memory_usage() const {
-  int atom_bytes=4;
+  int atom_bytes=3;
  if (_charge)
    atom_bytes+=1;
  if (_rot)
-    atom_bytes+=4;
+    atom_bytes+=3;
  int ans_bytes=atom_bytes+_ev_fields;
  return ans_bytes*(_max_local)*sizeof(acctyp)+
         sizeof(Answer<numtyp,acctyp>);
@@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
  if (csize>0)
    engv.update_host(_ev_stride*csize,true);
  if (_rot)
-    force.update_host(_inum*4*2,true);
+    force.update_host(_inum*3*2,true);
  else
-    force.update_host(_inum*4,true);
+    force.update_host(_inum*3,true);
  time_answer.stop();

  #ifndef GERYON_OCL_FLUSH
@@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
 template <class numtyp, class acctyp>
 void AnswerT::get_answers(double **f, double **tor) {
  if (_ilist==nullptr) {
-    typedef struct { double x,y,z; } vec3d;
-    typedef struct { acctyp x,y,z,w; } vec4d_t;
-    auto fp=reinterpret_cast<vec3d*>(&(f[0][0]));
-    auto forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
+    auto fp=reinterpret_cast<double*>(&(f[0][0]));

    #if (LAL_USE_OMP == 1)
    #pragma omp parallel
@@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) {
      #if (LAL_USE_OMP == 1)
      const int nthreads = omp_get_num_threads();
      const int tid = omp_get_thread_num();
-      const int idelta = _inum / nthreads + 1;
+      const int idelta = _inum*3 / nthreads + 1;
      const int ifrom = tid * idelta;
-      const int ito = std::min(ifrom + idelta, _inum);
+      const int ito = std::min(ifrom + idelta, _inum*3);
      #else
      const int ifrom = 0;
-      const int ito = _inum;
+      const int ito = _inum*3;
      #endif

-      for (int i=ifrom; i<ito; i++) {
-        fp[i].x+=forcep[i].x;
-        fp[i].y+=forcep[i].y;
-        fp[i].z+=forcep[i].z;
-      }
+      for (int i=ifrom; i<ito; i++)
+        fp[i]+=force[i];
      if (_rot) {
-        auto torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
-        auto torquep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
-        for (int i=ifrom; i<ito; i++) {
-          torp[i].x+=torquep[i].x;
-          torp[i].y+=torquep[i].y;
-          torp[i].z+=torquep[i].z;
-        }
+        auto torp=reinterpret_cast<double*>(&(tor[0][0]));
+        auto torquep=&(force[_inum*3]);
+        for (int i=ifrom; i<ito; i++)
+          torp[i]+=torquep[i];
      }
    }
  } else {
@@ -344,7 +335,7 @@ void AnswerT::get_answers(double **f, double **tor) {
      const int idelta = _inum / nthreads + 1;
      const int ifrom = tid * idelta;
      const int ito = std::min(ifrom + idelta, _inum);
-      int fl=ifrom*4;
+      int fl=ifrom*3;
      #else
      const int ifrom = 0;
      const int ito = _inum;
@@ -356,16 +347,16 @@ void AnswerT::get_answers(double **f, double **tor) {
        f[ii][0]+=force[fl];
        f[ii][1]+=force[fl+1];
        f[ii][2]+=force[fl+2];
-        fl+=4;
+        fl+=3;
      }
      if (_rot) {
-        fl=_inum*4 + ifrom*4;
+        fl=_inum*3 + ifrom*3;
        for (int i=ifrom; i<ito; i++) {
          int ii=_ilist[i];
          tor[ii][0]+=force[fl];
          tor[ii][1]+=force[fl+1];
          tor[ii][2]+=force[fl+2];
-          fl+=4;
+          fl+=3;
        }
      }
    }