Misc Improvements to GPU Package

- Optimizations for molecular systems - Improved kernel performance and greater CPU overlap - Reduced GPU to CPU communications for discrete devices - Switch classic Intel makefiles to use LLVM-based compilers - Prefetch optimizations supported for OpenCL - Optimized data repack for quaternions
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions
--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@ -32,7 +32,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -48,7 +48,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -123,7 +124,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -144,7 +145,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -166,6 +167,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];