Misc Improvements to GPU Package

- Optimizations for molecular systems
- Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communications for discrete devices
- Switched classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -52,7 +52,7 @@ _texture_2d( pos_tex,int4);
  compute the id of the cell where the atoms belong to
 x: atom coordinates
 cell_id: cell ids
-particle_id: 
+particle_id:
 boxlo[0-2]: the lower left corner of the local box
 ncell[xyz]: the number of cells in xyz dims
 i_cell_size is the inverse cell size
@ -489,6 +489,10 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,

 #endif

+#define SPECIAL_DATA_PRELOAD_SIZE 3
+#define UNROLL_FACTOR_LIST 4
+#define UNROLL_FACTOR_SPECIAL 2
+
 __kernel void kernel_special(__global int *dev_nbor,
                             __global int *host_nbor_list,
                             const __global int *host_numj,
@ -526,23 +530,68 @@ __kernel void kernel_special(__global int *dev_nbor,
      list_end=list+fast_mul(numj,stride);
    }

-    for ( ; list<list_end; list+=stride) {
-      int nbor=*list;
-      tagint jtag=tag[nbor];
+#if SPECIAL_DATA_PRELOAD_SIZE > 0
+    tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE];
+    for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) {
+      special_preload[j] = special[ii + i*nt];
+    }
+#endif

-      int offset=ii;
-      for (int i=0; i<n3; i++) {
-        if (special[offset]==jtag) {
-          int which = 1;
-          if (i>=n1)
-            which++;
-          if (i>=n2)
-            which++;
-          nbor=nbor ^ (which << SBBITS);
-          *list=nbor;
+    for ( ; list<list_end; list+=UNROLL_FACTOR_LIST * stride) {
+      int nbor[UNROLL_FACTOR_LIST];
+      tagint jtag[UNROLL_FACTOR_LIST];
+      __global int* list_addr[UNROLL_FACTOR_LIST];
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        list_addr[l] = list + l*stride;
+        nbor[l] = *list_addr[l];
+      }
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        jtag[l] = tag[nbor[l]];
+      }
+
+      for (int i=0, j=0; i<n3; i+=UNROLL_FACTOR_SPECIAL, j++) {
+        tagint special_data[UNROLL_FACTOR_SPECIAL];
+        int which[UNROLL_FACTOR_SPECIAL];
+
+        for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
+          which[c] = 1;
+          if (i + c < n3)
+          {
+#if SPECIAL_DATA_PRELOAD_SIZE > 0
+            if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) {
+              special_data[c] = special_preload[j];
+            }
+            else
+#endif
+              special_data[c] = special[ii + (i+c)*nt];
+          }
        }
-        offset+=nt;
+
+        for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
+          if (i+k >= n1) {
+            which[k]++;
+          }
+        }
+        for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
+          if (i+k >= n2) {
+            which[k]++;
+          }
+          which[k] <<= SBBITS;
+        }
+        for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
+          if (i + c < n3) {
+            for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+              if (special_data[c] == jtag[l]) {
+                nbor[l]=nbor[l] ^ which[c];
+              }
+            }
+          }
+        }
+      }
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        *list_addr[l] = nbor[l];
      }
    }
  } // if ii
 }
+