Fixed bugs with _tep and _fieldp to allow mixed-precision builds; these variables now defensively use acctyp instead of numtyp

2021-09-20 11:38:50 -05:00
parent 0228867d8e
commit 4e88cd158e
3 changed files with 72 additions and 72 deletions
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -102,7 +102,7 @@ _texture( q_tex,int2);
    dufld[5]=red_acc[5][tid];                                               \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    numtyp4 t;                                                              \
+    acctyp4 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -136,7 +136,7 @@ _texture( q_tex,int2);
    _fieldp[5]=red_acc[5][tid];                                             \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    numtyp4 f, fp;                                                          \
+    acctyp4 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -243,7 +243,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    numtyp4 t;                                                              \
+    acctyp4 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -266,7 +266,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    numtyp4 f, fp;                                                          \
+    acctyp4 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -591,7 +591,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
                                 const __global int *dev_short_nbor,
                                 __global acctyp4 *restrict ans,
                                 __global acctyp *restrict engv,
-                                 __global numtyp4 *restrict tep,
+                                 __global acctyp4 *restrict tep,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
                                 const int t_per_atom, const numtyp aewald,
@ -883,7 +883,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global numtyp4 *restrict fieldp,
+                                 __global acctyp4 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
@ -1097,7 +1097,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global numtyp4 *restrict fieldp,
+                                 __global acctyp4 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
@ -1256,55 +1256,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
 }

-/* ----------------------------------------------------------------------
-   scan standard neighbor list and make it compatible with 1-5 neighbors
-   if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
-   else scan special15 to see if a 1-5 neighbor and adjust offset to SBBITS15
-   else do nothing to IJ entry
------------------------------------------------------------------------- */
-
-__kernel void k_special15(__global int * dev_nbor,
-                          const __global int * dev_packed,
-                          const __global tagint *restrict tag,
-                          const __global int *restrict nspecial15,
-                          const __global tagint *restrict special15,
-                          const int inum, const int nall, const int nbor_pitch,
-                          const int t_per_atom) {
-  int tid, ii, offset, n_stride, i;
-  atom_info(t_per_atom,ii,tid,offset);
-
-  if (ii<inum) {
-  
-    int numj, nbor, nbor_end;
-    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,nbor_end,nbor);
-
-    int n15 = nspecial15[ii];
-
-    for ( ; nbor<nbor_end; nbor+=n_stride) {
-
-      int sj=dev_packed[nbor];
-      int which = sj >> SBBITS & 3;
-      int j = sj & NEIGHMASK;
-      tagint jtag = tag[j];
-
-      if (!which) {
-        int offset=ii;
-        for (int k=0; k<n15; k++) {
-          if (special15[offset] == jtag) {
-            which = 4;
-            break;
-          }
-          offset += nall;
-        }
-      }
-
-      if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);
-    } // for nbor
-
-  } // if ii
-}
-
 /* ----------------------------------------------------------------------
   polar_real = real-space portion of induced dipole polarization
   adapted from Tinker epreal1d() routine
@ -1319,7 +1270,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
                             const __global int *dev_short_nbor,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
-                            __global numtyp4 *restrict tep,
+                             __global acctyp4 *restrict tep,
                             const int eflag, const int vflag, const int inum,
                             const int nall, const int nbor_pitch, const int t_per_atom,
                             const numtyp aewald, const numtyp felec,
@ -1828,6 +1779,55 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
     offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
 }

+/* ----------------------------------------------------------------------
+   Rewrite the standard neighbor list so it also encodes 1-5 neighbors.
+   If an IJ entry already has nonzero special bits (1-2, 1-3 or 1-4), its
+   flag is re-encoded at bit offset SBBITS15; otherwise special15 is scanned
+   and a tag match is flagged as 4 (1-5); other entries are left untouched.
+------------------------------------------------------------------------- */
+
+__kernel void k_special15(__global int * dev_nbor,                      // written in place for flagged entries
+                          const __global int * dev_packed,              // packed neighbor entries that are read
+                          const __global tagint *restrict tag,          // tag[j] is compared against special15
+                          const __global int *restrict nspecial15,      // number of special15 entries to scan
+                          const __global tagint *restrict special15,    // candidate tags, walked with stride nall
+                          const int inum, const int nall, const int nbor_pitch,
+                          const int t_per_atom) {
+  int tid, ii, offset, n_stride, i;
+  atom_info(t_per_atom,ii,tid,offset);  // helper defined elsewhere; presumably sets ii, tid, offset - confirm
+
+  if (ii<inum) {  // only rows below inum are processed
+  
+    int numj, nbor, nbor_end;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,  // helper defined elsewhere;
+              n_stride,nbor_end,nbor);  // presumably sets i, numj, n_stride, nbor_end, nbor - confirm
+
+    int n15 = nspecial15[ii];  // NOTE(review): indexed by ii, not by i from nbor_info - confirm intended
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int sj=dev_packed[nbor];
+      int which = sj >> SBBITS & 3;  // parsed as (sj >> SBBITS) & 3: the two-bit special flag
+      int j = sj & NEIGHMASK;  // neighbor index with the flag bits masked off
+      tagint jtag = tag[j];
+
+      if (!which) {  // not flagged yet: search this row's special15 entries for jtag
+        int offset=ii;  // shadows the outer offset; the walk starts at element ii
+        for (int k=0; k<n15; k++) {
+          if (special15[offset] == jtag) {
+            which = 4;  // tag match: mark with flag value 4 (the 1-5 case per the header comment)
+            break;
+          }
+          offset += nall;  // consecutive entries for one row are nall elements apart
+        }
+      }
+
+      if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);  // read came from dev_packed, write goes to dev_nbor
+    } // for nbor: end of the loop over this thread's neighbor entries
+
+  } // if ii: end of the ii<inum guard
+}
+
 __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -52,7 +52,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas
  int gpu_rank=AMOEBAMF.device->gpu_rank();
  int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();

-  tep_size=sizeof(PRECISION);
+  tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION);

  AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);

--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -235,7 +235,7 @@ class BaseAmoeba {
    double** uind, double** uinp);

  /// Per-atom arrays
-  UCL_Vector<numtyp,numtyp> _tep, _fieldp;
+  UCL_Vector<acctyp,acctyp> _tep, _fieldp;
  int _nmax, _max_tep_size, _max_fieldp_size;

  // ------------------------ FORCE/ENERGY DATA -----------------------