Switched to the short neighbor list implementation in the pre-10Feb21 version (the recent version enforces tpa = 1 for short nbor)

2021-09-11 00:34:43 -05:00
parent 4ebe5833d3
commit 7f5a82dc54
5 changed files with 103 additions and 54 deletions
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -141,14 +141,31 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();

+  // Build the short neighbor list if needed
+  if (!this->short_nbor_avail) {
+    this->k_short_nbor.set_size(GX,BX);
+    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->dev_short_nbor, &_off2, &ainum,
+                          &nbor_pitch, &this->_threads_per_atom);
+    this->short_nbor_avail = true;
+  }
+
  this->k_polar.set_size(GX,BX);
  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                    &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv, &this->_tep,
                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                    &this->_threads_per_atom,
                    &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();
+
+  // Signal that short nbor list is not avail for the next time step
+  //   do it here because polar_real() is the last kernel in a time step at this point
+
+  this->short_nbor_avail = false;
+
  return GX;
 }

@ -163,20 +180,22 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {

  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
-  int GX;
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));

-  GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  // Build the short neighbor list if needed
+  if (!this->short_nbor_avail) {
    this->k_short_nbor.set_size(GX,BX);
-  // NOTE: this->nbor->dev_packed is not allocated!!
-/*  
-  this->k_short_nbor.run(&this->atom->x, &_off2,
-                         &this->nbor->dev_nbor,  &this->nbor->dev_packed,
-                         &ainum, &nbor_pitch, &this->_threads_per_atom);
-*/
-  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
+    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->dev_short_nbor, &_off2, &ainum,
+                          &nbor_pitch, &this->_threads_per_atom);
+    this->short_nbor_avail = true;
+  }
+  
  this->k_udirect2b.set_size(GX,BX);
  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                        &this->dev_short_nbor,
                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
                        &this->_threads_per_atom, &_aewald, &_off2,
                        &_polar_dscale, &_polar_uscale);
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -196,6 +196,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
                            const __global numtyp4 *restrict sp_polar,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
+                            const __global int *dev_short_nbor,
                            __global acctyp4 *restrict ans,
                            __global acctyp *restrict engv,
                            __global numtyp4 *restrict tep,
@ -255,6 +256,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;

    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

@ -262,6 +264,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
    //numtyp qtmp; fetch(qtmp,i,q_tex);
    //int itype=ix.w;

+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
    ci  = polar1[i].x;    // rpole[i][0];
    dix = polar1[i].y;    // rpole[i][1];
    diy = polar1[i].z;    // rpole[i][2];
@ -289,7 +299,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,

    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int jextra=dev_packed[nbor];
+      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -709,6 +719,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
                                 const __global numtyp4 *restrict sp_polar,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
                                 __global numtyp4 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
@ -733,6 +744,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

@ -740,6 +752,14 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
    //numtyp qtmp; fetch(qtmp,i,q_tex);
    //int itype=ix.w;

+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
    int itype,igroup;
    numtyp bn[4],bcn[3];
    numtyp fid[3],fip[3];
@ -769,7 +789,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,

    for ( ; nbor<nbor_end; nbor+=n_stride) {

-      int jextra=dev_packed[nbor];
+      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -1093,7 +1113,6 @@ __kernel void k_special15(__global int * dev_nbor,

  } // if ii
 }
-
 /*
 __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
                                   const numtyp off2, __global int * dev_nbor,
@ -1149,38 +1168,36 @@ __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
  }
 }
 */
-#ifdef LAL_SIMD_IP_SYNC
-#define t_per_atom t_per_atom_in
-#else
-#define t_per_atom 1
-#endif
-
 __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
-                                   const numtyp off2,
-                                   __global int * dev_nbor,
+                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
+                                   __global int * dev_short_nbor,
+                                   const numtyp off2,
                                   const int inum, const int nbor_pitch,
-                                   const int t_per_atom_in) {
-  const int ii=GLOBAL_ID_X;
+                                   const int t_per_atom) {
+  __local int n_stride;
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);

  if (ii<inum) {
-/*    
-    const int i=dev_packed[ii];
-    
-    int nbor=ii+nbor_pitch;
-    const int numj=dev_packed[nbor];
-    nbor+=nbor_pitch;
-    const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
+    int nbor, nbor_end;
+    int i, numj;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    int newj=0;

-    __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
-    const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
+    int ncount = 0;
+    int m = nbor;
+    dev_short_nbor[m] = 0;
+    int nbor_short = nbor+n_stride;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      int nj = j;
+      j &= NEIGHMASK15;

-    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
-      int sj=dev_packed[nbor];
-      int j = sj & NEIGHMASK15;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

      // Compute r12
@ -1190,15 +1207,14 @@ __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      if (rsq<off2) {
-        //*out_list=sj;
-        out_list++;
-        newj++;
-        if ((newj & (t_per_atom-1))==0)
-          out_list+=out_stride;
+        dev_short_nbor[nbor_short] = nj;
+        nbor_short += n_stride;
+        ncount++;
      }
-
    } // for nbor
-    //dev_nbor[ii+nbor_pitch]=newj;
-*/
+
+    // store the number of neighbors for each thread
+    dev_short_nbor[m] = ncount;
+
  } // if ii
 }
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -21,7 +21,7 @@ namespace LAMMPS_AL {
 extern Device<PRECISION,ACC_PRECISION> global_device;

 template <class numtyp, class acctyp>
-BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) {
+BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) {
  device=&global_device;
  ans=new Answer<numtyp,acctyp>();
  nbor=new Neighbor();
@ -101,8 +101,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
    _nbor_data=&(nbor->dev_nbor);
  }

+  bool allocate_packed = false;
  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
-                              _gpu_host,max_nbors,cell_size,false,_threads_per_atom);
+                              _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom);
  if (success!=0)
    return success;
                              
@ -126,6 +127,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  if (ef_nall==0)
    ef_nall=2000;

+  dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
+
  _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
  _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);

@ -158,6 +161,7 @@ void BaseAmoebaT::clear_atomic() {
  time_pair.clear();
  hd_balancer.clear();

+  dev_short_nbor.clear();
  nbor->clear();
  ans->clear();

@ -195,7 +199,7 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
 // Build neighbor list on device
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
+inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
                                         const int nall, double **host_x,
                                         int *host_type, double *sublo,
                                         double *subhi, tagint *tag,
@ -206,7 +210,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
  resize_atom(inum,nall,success);
  resize_local(inum,host_inum,nbor->max_nbors(),success);
  if (!success)
-    return;
+    return 0;
  atom->cast_copy_x(host_x,host_type);

  int mn;
@ -232,6 +236,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
  if (bytes>_max_an_bytes)
    _max_an_bytes=bytes;
+  return mn;
 }

 // ---------------------------------------------------------------------------
@ -385,7 +390,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall

  // Build neighbor list on GPU if necessary
  if (ago==0) {
-    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+    _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                    sublo, subhi, tag, nspecial, special, nspecial15, special15,
                    success);
    if (!success)
@ -409,6 +414,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
                     boxlo, prd);

+  // re-allocate dev_short_nbor if necessary
+  if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
+    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
+    dev_short_nbor.resize((2+_max_nbors)*_nmax);
+  }
+
  return nbor->host_jlist.begin()-host_start;
 }

--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -123,7 +123,7 @@ class BaseAmoeba {
                    int **firstneigh, bool &success);

  /// Build neighbor list on device
-  void build_nbor_list(const int inum, const int host_inum,
+  int build_nbor_list(const int inum, const int host_inum,
                       const int nall, double **host_x, int *host_type,
                       double *sublo, double *subhi, tagint *tag, int **nspecial,
                       tagint **special, int *nspecial15, tagint **special15,
@ -236,6 +236,8 @@ class BaseAmoeba {

  int add_onefive_neighbors();

+  UCL_D_Vec<int> dev_short_nbor;
+
  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
  UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
@ -251,8 +253,9 @@ class BaseAmoeba {
  bool _compiled;
  int _block_size, _block_bio_size, _threads_per_atom;
  int _extra_fields;
-  double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15;
+  double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
  double _gpu_overhead, _driver_overhead;
+  bool short_nbor_avail;
  UCL_D_Vec<int> *_nbor_data;

  void compile_kernels(UCL_Device &dev, const void *pair_string,
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)

  gpu_udirect2b_ready = true;
  gpu_umutual2b_ready = false;
-  gpu_polar_real_ready = false;
+  gpu_polar_real_ready = true;

  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }