Separate the computation of newsite into another kernel

This eliminates the need for thread fence and makes the calculation stable on GTX1070 (CUDA and OpenCL) and TitanV
2019-12-07 15:54:20 +03:00
parent 4a51e1660f
commit a3fca53e97
3 changed files with 51 additions and 23 deletions
--- a/lib/gpu/lal_lj_tip4p_long.cpp
+++ b/lib/gpu/lal_lj_tip4p_long.cpp
@ -66,6 +66,7 @@ int LJTIP4PLongT::init(const int ntypes,
    return success;
  k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib");
  k_pair_reneigh.set_function(*this->pair_program,"k_lj_tip4p_reneigh");
+  k_pair_newsite.set_function(*this->pair_program,"k_lj_tip4p_newsite");

  TypeH = tH;
  TypeO = tO;
@ -163,6 +164,7 @@ void LJTIP4PLongT::clear() {

  k_pair_distrib.clear();
  k_pair_reneigh.clear();
+  k_pair_newsite.clear();

  this->clear_atomic();
 }
@ -195,9 +197,8 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  int GX;
-
+  GX=static_cast<int>(ceil(static_cast<double>(nall)/BX));
  if (t_ago == 0) {
-    GX=static_cast<int>(ceil(static_cast<double>(nall)/BX));
    this->k_pair_reneigh.set_size(GX,BX);
    this->k_pair_reneigh.run(&this->atom->x,
        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
@ -205,6 +206,14 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
        &hneight, &m, &TypeO, &TypeH,
        &tag, &map_array, &atom_sametag);
  }
+  this->k_pair_newsite.set_size(GX,BX);
+  this->k_pair_newsite.run(&this->atom->x,
+          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+          &nall, &ainum,
+          &nbor_pitch, &this->_threads_per_atom,
+          &hneight, &m, &TypeO, &TypeH, &alpha,
+          &this->atom->q, &tag, &map_array,
+          &atom_sametag);

  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));
--- a/lib/gpu/lal_lj_tip4p_long.cu
+++ b/lib/gpu/lal_lj_tip4p_long.cu
@ -65,11 +65,12 @@ ucl_inline int closest_image(int i, int j, const __global int* sametag,
 }

 ucl_inline void compute_newsite(int iO, int  iH1, int  iH2,
-    __global numtyp4 *xM,
+    __global numtyp4 *xM, numtyp q,
    numtyp alpha, const __global numtyp4 *restrict x_){
  numtyp4 xO;  fetch4(xO,iO,pos_tex);
  numtyp4 xH1; fetch4(xH1,iH1,pos_tex);
  numtyp4 xH2; fetch4(xH2,iH2,pos_tex);
+  numtyp4 M;

  numtyp delx1 = xH1.x - xO.x;
  numtyp dely1 = xH1.y - xO.y;
@ -81,9 +82,12 @@ ucl_inline void compute_newsite(int iO, int  iH1, int  iH2,

  numtyp ap = alpha * (numtyp)0.5;

-  (*xM).x = xO.x + ap * (delx1 + delx2);
-  (*xM).y = xO.y + ap * (dely1 + dely2);
-  (*xM).z = xO.z + ap * (delz1 + delz2);
+  M.x = xO.x + ap * (delx1 + delx2);
+  M.y = xO.y + ap * (dely1 + dely2);
+  M.z = xO.z + ap * (delz1 + delz2);
+  M.w = q;
+
+  *xM = M;
 }

 __kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_,
@ -214,6 +218,37 @@ __kernel void k_lj_tip4p_reneigh(const __global numtyp4 *restrict x_,
 }


+__kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
+    const __global int * dev_nbor,
+    const __global int * dev_packed,
+    const int nall, const int inum,
+    const int nbor_pitch, const int t_per_atom,
+    __global int *restrict hneigh,
+    __global numtyp4 *restrict m,
+    const int typeO, const int typeH,
+    const numtyp alpha, const __global numtyp *restrict q_,
+    const __global int *restrict tag, const __global int *restrict map,
+    const __global int *restrict sametag) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+  int i = BLOCK_ID_X*(BLOCK_SIZE_X)+tid;
+
+  if (i<nall) {
+    int iO, iH1, iH2;
+    iO  = i;
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype = ix.w;
+    if (itype == typeO){
+      int iH1, iH2, iO;
+      iH1 = hneigh[i*4  ];
+      iH2 = hneigh[i*4+1];
+      iO  = i;
+      numtyp qO; fetch(qO,iO,q_tex);
+      compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_);
+    }
+  }
+}
+
 __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
    const __global numtyp4 *restrict lj1,
    const __global numtyp4 *restrict lj3,
@ -269,11 +304,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
      iO  = i;
      iH1 = hneigh[i*4  ];
      iH2 = hneigh[i*4+1];
-      if(fabs(m[iO].w) <= eq_zero) {
-        compute_newsite(iO,iH1,iH2, &m[iO], alpha, x_);
-        __threadfence();
-        m[iO].w = qtmp;
-      }
      x1 = m[iO];
    } else {
      iO  = hneigh[i *4  ];
@ -281,12 +311,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
      iH2 = hneigh[iO*4+1];
      if (iO >= inum) {
        non_local_oxy = 1;
-        if(fabs(m[iO].w) <= eq_zero) {
-          compute_newsite(iO,iH1,iH2, &m[iO], alpha, x_);
-          numtyp qO; fetch(qO,iO,q_tex);
-          __threadfence();
-          m[iO].w = qO;
-        }
      }
    }

@ -341,11 +365,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
            jO = j;
            jH1 = hneigh[j*4  ];
            jH2 = hneigh[j*4+1];
-            if (fabs(m[j].w) <= eq_zero) {
-              compute_newsite(j, jH1, jH2, &m[j], alpha, x_);
-              __threadfence();
-              m[j].w = qj;
-            }
            x2 = m[j];
          }
          delx = x1.x-x2.x;
--- a/lib/gpu/lal_lj_tip4p_long.h
+++ b/lib/gpu/lal_lj_tip4p_long.h
@ -116,7 +116,7 @@ public:
  UCL_D_Vec<int> map_array;
  UCL_D_Vec<int> atom_sametag;

-  UCL_Kernel k_pair_distrib, k_pair_reneigh;
+  UCL_Kernel k_pair_distrib, k_pair_reneigh, k_pair_newsite;

 private:
  bool _allocated;