Feb2021 GPU Package Update - GPU Package Files

2021-02-15 08:20:50 -08:00
parent 16004e8f45
commit e7e2d2323b
345 changed files with 13424 additions and 7708 deletions
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@ -24,6 +24,9 @@
 #include "geryon/ocl_mat.h"
 #include "geryon/ocl_kernel.h"
 using namespace ucl_opencl;
+#ifndef LAL_NO_OCL_EV_JIT
+#define LAL_OCL_EV_JIT
+#endif
 #elif defined(USE_CUDART)
 #include "geryon/nvc_timer.h"
 #include "geryon/nvc_mat.h"
@ -178,7 +181,7 @@ class Atom {
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
-    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
+    view.view_offset(0,buffer,m_size*m_size);
    ucl_copy(dev_v,view,false);
  }

@ -197,7 +200,26 @@ class Atom {
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
-    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
+    view.view_offset(0,buffer,m_size*m_size);
+    ucl_copy(dev_v,view,false);
+  }
+
+  /// Pack two n x n x n LAMMPS atom type constant tables (one, two) into buffer as interleaved numtyp pairs and copy to device
+  template <class dev_typ, class t1, class t2>
+  inline void type_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
+                         UCL_H_Vec<numtyp> &buffer, t1 ***one, t2 ***two) {  // writes buffer[0 .. 2*n*n*n-1]; no bounds check is done here
+    int ii=0;  // running index of the current (i,j,k) triple; each triple owns two buffer slots
+    for (int i=0; i<n; i++) {
+      for (int j=0; j<n; j++) {
+        for (int k=0; k<n; k++) {
+          buffer[ii*2]=static_cast<numtyp>(one[i][j][k]);  // even slot: value from one, converted to numtyp
+          buffer[ii*2+1]=static_cast<numtyp>(two[i][j][k]);  // odd slot: value from two, converted to numtyp
+          ii++;
+        }
+      }
+    }
+    UCL_H_Vec<dev_typ> view;  // host vector typed as dev_typ, attached to buffer on the next line
+    view.view_offset(0,buffer,n*n*n);  // n*n*n dev_typ elements from offset 0; presumably dev_typ holds one (one,two) pair -- TODO confirm
    ucl_copy(dev_v,view,false);  // copy the packed view into dev_v; NOTE(review): meaning of the trailing false is not visible here
  }

@ -217,7 +239,7 @@ class Atom {
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
-    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
+    view.view_offset(0,buffer,m_size*m_size);
    ucl_copy(dev_v,view,false);
  }

@ -238,7 +260,7 @@ class Atom {
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
-    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
+    view.view_offset(0,buffer,m_size*m_size);
    ucl_copy(dev_v,view,false);
  }

@ -251,7 +273,7 @@ class Atom {
      buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
    }
    UCL_H_Vec<dev_typ> view;
-    view.view((dev_typ*)buffer.begin(),n,*dev);
+    view.view_offset(0,buffer,n);
    ucl_copy(dev_v,view,false);
  }

@ -261,6 +283,9 @@ class Atom {
  inline void data_unavail()
    { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }

+  typedef struct { double x,y,z; } vec3d;  // three doubles; the cast loops reinterpret &host_ptr[0][0] as an array of these (assumes contiguous host rows -- confirm)
+  typedef struct { numtyp x,y,z,w; } vec4d_t;  // four numtyp values overlaid on the x / v write buffers; w receives host_type[i] or host_tag[i]
+
  /// Cast positions and types to write buffer
  inline void cast_x_data(double **host_ptr, const int *host_type) {
    if (_x_avail==false) {
@ -269,13 +294,16 @@ class Atom {
      memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
      #else
-      int wl=0;
+      vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
+      vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
+      #if (LAL_USE_OMP == 1)
+      #pragma omp parallel for schedule(static)
+      #endif
      for (int i=0; i<_nall; i++) {
-        x[wl]=host_ptr[i][0];
-        x[wl+1]=host_ptr[i][1];
-        x[wl+2]=host_ptr[i][2];
-        x[wl+3]=host_type[i];
-        wl+=4;
+        xp[i].x=host_p[i].x;
+        xp[i].y=host_p[i].y;
+        xp[i].z=host_p[i].z;
+        xp[i].w=host_type[i];
      }
      #endif
      _time_cast+=MPI_Wtime()-t;
@ -320,6 +348,11 @@ class Atom {
      } else if (sizeof(numtyp)==sizeof(double))
        memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
      else
+        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
+        #pragma omp parallel for simd schedule(static)
+        #elif (LAL_USE_OMP_SIMD == 1)
+        #pragma omp simd
+        #endif
        for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
      _time_cast+=MPI_Wtime()-t;
    }
@ -346,6 +379,11 @@ class Atom {
      } else if (sizeof(numtyp)==sizeof(double))
        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
      else
+        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
+        #pragma omp parallel for simd schedule(static)
+        #elif (LAL_USE_OMP_SIMD == 1)
+        #pragma omp simd
+        #endif
        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
      _time_cast+=MPI_Wtime()-t;
    }
@ -370,13 +408,16 @@ class Atom {
      memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
      #else
-      int wl=0;
+      vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
+      vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
+      #if (LAL_USE_OMP == 1)
+      #pragma omp parallel for schedule(static)
+      #endif
      for (int i=0; i<_nall; i++) {
-        v[wl]=host_ptr[i][0];
-        v[wl+1]=host_ptr[i][1];
-        v[wl+2]=host_ptr[i][2];
-        v[wl+3]=host_tag[i];
-        wl+=4;
+        vp[i].x=host_p[i].x;
+        vp[i].y=host_p[i].y;
+        vp[i].z=host_p[i].z;
+        vp[i].w=host_tag[i];
      }
      #endif
      _time_cast+=MPI_Wtime()-t;