Misc Improvements to GPU Package

- Optimizations for molecular systems
- Improved kernel performance and greater CPU overlap
- Reduced GPU to CPU communications for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@ -86,7 +86,7 @@
        ap1+=astride;                                                        \
      }                                                                      \
    }                                                                        \
-    acctyp4 old=ans[ii];                                                     \
+    acctyp3 old=ans[ii];                                                     \
    old.x+=f.x;                                                              \
    old.y+=f.y;                                                              \
    old.z+=f.z;                                                              \
@ -131,7 +131,7 @@
        ap1+=astride;                                                       \
      }                                                                     \
    }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -154,7 +154,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
                                           const int ntypes,
                                           const __global int *dev_nbor,
                                           const int stride,
-                                           __global acctyp4 *restrict ans,
+                                           __global acctyp3 *restrict ans,
                                           const int astride,
                                           __global acctyp *restrict engv,
                                           __global int *restrict err_flag,
@ -180,7 +180,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
  const numtyp solv_f_r =
     (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, virial[6];
@ -216,6 +216,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
@ -409,7 +410,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
                                           const int ntypes,
                                           const __global int *dev_nbor,
                                           const int stride,
-                                           __global acctyp4 *restrict ans,
+                                           __global acctyp3 *restrict ans,
                                           __global acctyp *restrict engv,
                                           __global int *restrict err_flag,
                                           const int eflag, const int vflag,
@ -435,7 +436,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
  const numtyp solv_f_r =
    (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -454,6 +455,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int i=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(i)];
      i &= NEIGHMASK;
@ -610,7 +612,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict gum,
                             const int stride,
                             const __global int *dev_ij,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             __global int *restrict err_flag,
                             const int eflag, const int vflag, const int start,
@ -628,7 +630,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[2];
  sp_lj[3]=gum[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -647,6 +649,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -697,7 +700,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict gum,
                                  const int stride,
                                  const __global int *dev_ij,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  __global int *restrict err_flag,
                                  const int eflag, const int vflag,
@ -721,7 +724,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -743,6 +746,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];