Enabled again neigh no with tpa > 1 for 3-body gpu styles for backward compatibility, could be slower than neigh no tpa 1 in many cases

2017-07-22 22:57:37 -05:00
parent 3d1d0c58c7
commit a71f5a0c20
6 changed files with 349 additions and 217 deletions
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -271,9 +271,8 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    int nbor_j, nbor_end;
-    int i, numj;
-
+    int nbor_j, nbor_end, i, numj;
+    const int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -284,14 +283,17 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
    itype=map[itype];

    // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
    int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);

    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {

-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -312,11 +314,14 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
      z = (acctyp)0;

      int nbor_k = nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int k_end = nbor_end;
+      if (dev_packed==dev_nbor) {
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }

      for ( ; nbor_k < k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;

        if (k == j) continue;
@ -353,7 +358,8 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
      //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
      //int idx = jj*n_stride + i*t_per_atom + offset_j;
      //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
-      int idx = nbor_j - n_stride;
+      int idx = nbor_j;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
      acc_zeta(z, tid, t_per_atom, offset_k);
@ -426,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
  __syncthreads();

  if (ii<inum) {
-    int nbor, nbor_end;
-    int i, numj;
+    int nbor, nbor_end, i, numj;
+    const int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

@ -436,13 +442,16 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
    itype=map[itype];

    // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor];
-    nbor += n_stride;
-    nbor_end = nbor+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int j=dev_short_nbor[nbor];
+      int j=nbor_mem[nbor];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -536,7 +545,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end;
-
+    const int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -547,14 +556,17 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
    itype=map[itype];

    // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
    int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);

    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {

-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -578,7 +590,8 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
      //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
      //int idx = jj*n_stride + i*t_per_atom + offset_j;
      //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
-      int idx = nbor_j - n_stride;
+      int idx = nbor_j;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
      acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
@ -602,11 +615,14 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
      }

      int nbor_k = nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int k_end = nbor_end;
+      if (dev_packed==dev_nbor) {
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }

      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;

        if (j == k) continue;
@ -717,7 +733,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -730,13 +746,16 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
    numtyp tpainv = ucl_recip((numtyp)t_per_atom);

    // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }

    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {

-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -776,16 +795,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
      }

      // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
      int nbork_start = nbor_k;

      // look up for zeta_ji: find i in the j's neighbor list
      int m = tid / t_per_atom;
      int ijnum = -1;
      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;
        if (k == i) {
          ijnum = nbor_k;
@ -808,7 +829,8 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
      //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
      //int idx = iix*n_stride + j*t_per_atom + offset_kf;
      //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
-      int idx = ijnum - n_stride;
+      int idx = ijnum;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               j, ijnum, offset_kf, idx);
      acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
@ -833,7 +855,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,

      // attractive forces
      for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;

        if (k == i) continue;
@ -875,7 +897,8 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
        //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
        //int idx = kk*n_stride + j*t_per_atom + offset_k;
        //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
-        int idx = nbor_k - n_stride;
+        int idx = nbor_k;
+        if (dev_packed==dev_nbor) idx -= n_stride;
 //        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //                 j, nbor_k, offset_k, idx);
        acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
@ -957,7 +980,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem = dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -970,13 +993,16 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
    numtyp tpainv = ucl_recip((numtyp)t_per_atom);

    // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }

    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {

-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
      j &= NEIGHMASK;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -1016,16 +1042,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
      }

      // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
      int nbork_start = nbor_k;

      // look up for zeta_ji
      int m = tid / t_per_atom;
      int ijnum = -1;
      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;
        if (k == i) {
          ijnum = nbor_k;
@ -1048,7 +1076,8 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
      //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
      //int idx = iix*n_stride + j*t_per_atom + offset_kf;
      //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
-      int idx = ijnum - n_stride;
+      int idx = ijnum;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               j, ijnum, offset_kf, idx);
      acctyp4 zeta_ji = zetaij[idx]; //  fetch(zeta_ji,idx,zeta_tex);
@ -1073,7 +1102,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,

      // attractive forces
      for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
        k &= NEIGHMASK;

        if (k == i) continue;
@ -1122,7 +1151,8 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
        //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
        //int idx = kk*n_stride + j*t_per_atom + offset_k;
        //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
-        int idx = nbor_k - n_stride;
+        int idx = nbor_k;
+        if (dev_packed==dev_nbor) idx -= n_stride;
 //        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //                 j, nbor_k, offset_k, idx);
        acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
@ -1152,7 +1182,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
        virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
        virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
        virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
-
      }
    } // for nbor