From 68206079da39fbef08c37e5c1cef1ae8bd9ad2c6 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Fri, 7 Jul 2017 16:47:24 -0500
Subject: [PATCH 01/11] Supported short neighbor lists for 3-body kernels in
 sw/gpu and vashishta/gpu

---
 lib/gpu/lal_aux_fun1.h     |  24 +++----
 lib/gpu/lal_base_three.cpp |  65 ++++++++++++++----
 lib/gpu/lal_base_three.h   |  27 ++++----
 lib/gpu/lal_sw.cpp         |  26 ++++++--
 lib/gpu/lal_sw.cu          | 117 +++++++++++++++++++++++++++++---
 lib/gpu/lal_vashishta.cpp  |  34 +++++++---
 lib/gpu/lal_vashishta.cu   | 133 +++++++++++++++++++++++++++++--------
 lib/gpu/lal_vashishta.h    |   1 +
 8 files changed, 332 insertions(+), 95 deletions(-)
diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h
index b40bb7f943..47a216ff6f 100644
--- a/lib/gpu/lal_aux_fun1.h
+++ b/lib/gpu/lal_aux_fun1.h
@@ -22,21 +22,21 @@
   offset=tid & (t_per_atom-1);                                               \
   ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
 
-#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
-                  i, numj, stride, nbor_end, nbor_begin)                     \
-  i=nbor_mem[ii];                                                            \
-  nbor_begin=ii+nbor_stride;                                                 \
-  numj=nbor_mem[nbor_begin];                                                 \
-  if (nbor_mem==packed_mem) {                                                \
-    nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1);                       \
-    stride=fast_mul(t_per_atom,nbor_stride);                                 \
-    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
+#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset,  \
+                  i, numj, n_stride, nbor_end, nbor_begin)                   \
+  i=dev_nbor[ii];                                                            \
+  nbor_begin=ii+nbor_pitch;                                                  \
+  numj=dev_nbor[nbor_begin];                                                 \
+  if (dev_nbor==dev_packed) {                                                \
+    nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1);                        \
+    n_stride=fast_mul(t_per_atom,nbor_pitch);                                \
+    nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
     nbor_begin+=offset;                                                      \
   } else {                                                                   \
-    nbor_begin+=nbor_stride;                                                 \
-    nbor_begin=nbor_mem[nbor_begin];                                         \
+    nbor_begin+=nbor_pitch;                                                  \
+    nbor_begin=dev_nbor[nbor_begin];                                         \
     nbor_end=nbor_begin+numj;                                                \
-    stride=t_per_atom;                                                       \
+    n_stride=t_per_atom;                                                     \
     nbor_begin+=offset;                                                      \
   }
 
diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp
index f772e36295..fd9fc7f272 100644
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
 extern Device<PRECISION,ACC_PRECISION> global_device;
 
 template <class numtyp, class acctyp>
-BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0)  {
+BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _short_nbor(false) {
   device=&global_device;
   ans=new Answer<numtyp,acctyp>();
   nbor=new Neighbor();
@@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
                            const int max_nbors, const int maxspecial,
                            const double cell_size, const double gpu_split,
                            FILE *_screen, const void *pair_program,
-                           const char *k_two, const char *k_three_center,
-                           const char *k_three_end) {
+                           const char *two, const char *three_center,
+                           const char *three_end, const char *short_nbor) {
   screen=_screen;
 
   int gpu_nbor=0;
@@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
     _gpu_host=1;
 
   _threads_per_atom=device->threads_per_atom();
-  if (_threads_per_atom>1 && gpu_nbor==0) {
+  if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
     nbor->packing(true);
     _nbor_data=&(nbor->dev_packed);
-  } else
+  } else  // neigh yes or tpa == 1
     _nbor_data=&(nbor->dev_nbor);
   if (_threads_per_atom*_threads_per_atom>device->warp_size())
     return -10;
@@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
 
   _block_pair=device->pair_block_size();
   _block_size=device->block_ellipse();
-  compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
+  compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
 
   // Initialize host-device load balancer
   hd_balancer.init(device,gpu_nbor,gpu_split);
@@ -113,6 +113,15 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
   _max_an_bytes+=ans2->gpu_bytes();
   #endif
 
+  // if short neighbor list is supported
+  if (short_nbor) {
+    _short_nbor = true;
+    int ef_nall=nall;
+    if (ef_nall==0)
+      ef_nall=2000;
+    dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
+  }
+
   return 0;
 }
 
@@ -136,6 +145,7 @@ void BaseThreeT::clear_atomic() {
     k_three_end.clear();
     k_three_end_vatom.clear();
     k_pair.clear();
+    k_short_nbor.clear();
     delete pair_program;
     _compiled=false;
   }
@@ -143,6 +153,7 @@ void BaseThreeT::clear_atomic() {
   time_pair.clear();
   hd_balancer.clear();
 
+  dev_short_nbor.clear();
   nbor->clear();
   ans->clear();
   #ifdef THREE_CONCURRENT
@@ -247,12 +258,26 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
     reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
     if (!success)
       return;
+    _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
   }
 
   atom->cast_x_data(host_x,host_type);
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
 
+  // if short neighbor list is supported
+  if (_short_nbor) {
+
+    // re-allocate dev_short_nbor if necessary
+    if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
+      int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
+      dev_short_nbor.resize((2+_max_nbors)*_nmax);
+    }
+  }
+
+  // _ainum to be used in loop() for short neighbor list build
+  _ainum = nlist;
+
   int evatom=0;
   if (eatom || vatom)
     evatom=1;
@@ -300,7 +325,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
 
   // Build neighbor list on GPU if necessary
   if (ago==0) {
-    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+    _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                     sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
@@ -313,6 +338,19 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
   *ilist=nbor->host_ilist.begin();
   *jnum=nbor->host_acc.begin();
 
+  // if short neighbor list is supported
+  if (_short_nbor) {
+
+    // re-allocate dev_short_nbor if necessary
+    if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
+      int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
+      dev_short_nbor.resize((2+_max_nbors)*_nmax);
+    }
+  }
+
+  // _ainum to be used in loop() for short neighbor list build
+  _ainum = nall;
+
   int evatom=0;
   if (eatom || vatom)
     evatom=1;
@@ -339,19 +377,20 @@ double BaseThreeT::host_memory_usage_atomic() const {
 
 template <class numtyp, class acctyp>
 void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
-                                 const char *ktwo, const char *kthree_center,
-                                 const char *kthree_end) {
+                                 const char *two, const char *three_center,
+                                 const char *three_end, const char* short_nbor) {
   if (_compiled)
     return;
 
-  std::string vatom_name=std::string(kthree_end)+"_vatom";
+  std::string vatom_name=std::string(three_end)+"_vatom";
 
   pair_program=new UCL_Program(dev);
   pair_program->load_string(pair_str,device->compile_string().c_str());
-  k_three_center.set_function(*pair_program,kthree_center);
-  k_three_end.set_function(*pair_program,kthree_end);
+  k_three_center.set_function(*pair_program,three_center);
+  k_three_end.set_function(*pair_program,three_end);
   k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
-  k_pair.set_function(*pair_program,ktwo);
+  k_pair.set_function(*pair_program,two);
+  if (short_nbor) k_short_nbor.set_function(*pair_program,short_nbor);
   pos_tex.get_texture(*pair_program,"pos_tex");
 
   #ifdef THREE_CONCURRENT
diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h
index 4f27ecdf92..d03a7521cd 100644
--- a/lib/gpu/lal_base_three.h
+++ b/lib/gpu/lal_base_three.h
@@ -56,7 +56,8 @@ class BaseThree {
                  const int maxspecial, const double cell_size,
                  const double gpu_split, FILE *screen,
                  const void *pair_program, const char *k_two,
-                 const char *k_three_center, const char *k_three_end);
+                 const char *k_three_center, const char *k_three_end,
+                 const char *k_short_nbor=NULL);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
@@ -74,8 +75,8 @@ class BaseThree {
 
   /// Check if there is enough storage for neighbors and realloc if not
   /** \param nlocal number of particles whose nbors must be stored on device
-    * \param host_inum number of particles whose nbors need to copied to host
-    * \param current maximum number of neighbors
+    * \param max_nbors maximum number of neighbors
+    * \param success set to false if insufficient memory
     * \note olist_size=total number of local particles **/
   inline void resize_local(const int inum, const int max_nbors, bool &success) {
     nbor->resize(inum,max_nbors,success);
@@ -84,7 +85,7 @@ class BaseThree {
   /// Check if there is enough storage for neighbors and realloc if not
   /** \param nlocal number of particles whose nbors must be stored on device
     * \param host_inum number of particles whose nbors need to copied to host
-    * \param current maximum number of neighbors
+    * \param max_nbors current maximum number of neighbors
     * \note host_inum is 0 if the host is performing neighboring
     * \note nlocal+host_inum=total number local particles
     * \note olist_size=0 **/
@@ -143,14 +144,6 @@ class BaseThree {
                const bool vflag, const bool eatom, const bool vatom,
                int &host_start, const double cpu_time, bool &success);
 
-  /// Pair loop with device neighboring
-  int * compute(const int ago, const int inum_full, const int nall,
-                double **host_x, int *host_type, double *sublo,
-                double *subhi, tagint *tag, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag,
-                const bool eatom, const bool vatom, int &host_start,
-                const double cpu_time, bool &success);
-
   /// Pair loop with device neighboring
   int ** compute(const int ago, const int inum_full,
                  const int nall, double **host_x, int *host_type, double *sublo,
@@ -193,6 +186,9 @@ class BaseThree {
   /// Neighbor data
   Neighbor *nbor;
 
+  UCL_D_Vec<int> dev_short_nbor;
+  UCL_Kernel k_short_nbor;
+
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pair_program;
   UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
@@ -203,16 +199,17 @@ class BaseThree {
   UCL_Texture pos_tex;
 
  protected:
-  bool _compiled;
+  bool _compiled,_short_nbor;
   int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
   int _gpu_nbor;
   double _max_bytes, _max_an_bytes;
+  int _max_nbors, _ainum;
   double _gpu_overhead, _driver_overhead;
   UCL_D_Vec<int> *_nbor_data;
 
   void compile_kernels(UCL_Device &dev, const void *pair_string,
-                       const char *k_two, const char *k_three_center,
-                       const char *k_three_end);
+                       const char *two, const char *three_center,
+                       const char *three_end, const char* short_nbor);
 
   virtual void loop(const bool _eflag, const bool _vflag,
                     const int evatom) = 0;
diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp
index 3492d7030e..24984e4878 100644
--- a/lib/gpu/lal_sw.cpp
+++ b/lib/gpu/lal_sw.cpp
@@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
   int success;
   success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                            _screen,sw,"k_sw","k_sw_three_center",
-                           "k_sw_three_end");
+                           "k_sw_three_end","k_sw_short_nbor");
   if (success!=0)
     return success;
 
@@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+  // build the short neighbor list
+  int ainum=this->_ainum;
+  int nbor_pitch=this->nbor->nbor_pitch();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
                                (BX/this->_threads_per_atom)));
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
+                 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                 &this->dev_short_nbor, &ainum,
+                 &nbor_pitch, &this->_threads_per_atom);
 
   // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
   // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
-  int ainum=this->ans->inum();
-  int nbor_pitch=this->nbor->nbor_pitch();
+  ainum=this->ans->inum();
+  nbor_pitch=this->nbor->nbor_pitch();
+  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
   this->time_pair.start();
-
+  
   this->k_pair.set_size(GX,BX);
   this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
                    &map, &elem2param, &_nelements,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                   &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv,
                    &eflag, &vflag, &ainum, &nbor_pitch,
                    &this->_threads_per_atom);
@@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
                            &map, &elem2param, &_nelements,
                            &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                           &this->dev_short_nbor,
                            &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
                            &nbor_pitch, &this->_threads_per_atom, &evatom);
 
@@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
     this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
                           &map, &elem2param, &_nelements,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
 
@@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
     this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
                           &map, &elem2param, &_nelements,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
 
diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu
index 46330c59e4..7dea52898e 100644
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@@ -130,6 +130,64 @@ texture<int4> sw3_tex;
 
 #endif
 
+__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
+                           const __global numtyp4 *restrict sw3,
+                           const __global int *restrict map,
+                           const __global int *restrict elem2param,
+                           const int nelements,
+                           const __global int * dev_nbor,
+                           const __global int * dev_packed,
+                           __global int * dev_short_nbor,
+                           const int inum, const int nbor_pitch, const int t_per_atom) {
+  __local int n_stride;
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
+    itype=map[itype];
+
+    int ncount = 0;
+    int m = nbor;
+    dev_short_nbor[m] = 0;
+    int nbor_short = nbor+n_stride;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      int nj = j;
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+      jtype=map[jtype];
+
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
+        dev_short_nbor[nbor_short] = nj;
+        nbor_short += n_stride;
+        ncount++;
+      }
+    } // for nbor
+
+    // store the number of neighbors for each thread
+    dev_short_nbor[m] = ncount;
+
+  } // if ii
+}
 
 __kernel void k_sw(const __global numtyp4 *restrict x_,
                    const __global numtyp4 *restrict sw1,
@@ -140,6 +198,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
                    const int nelements,
                    const __global int * dev_nbor,
                    const __global int * dev_packed,
+                   const __global int * dev_short_nbor,
                    __global acctyp4 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
@@ -167,9 +226,14 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor];
+    nbor += n_stride;
+    nbor_end = nbor+fast_mul(numj,n_stride);
+
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=dev_packed[nbor];
+      int j=dev_short_nbor[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -337,6 +401,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                 const int nelements,
                                 const __global int * dev_nbor,
                                 const __global int * dev_packed,
+                                const __global int * dev_short_nbor,
                                 __global acctyp4 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
@@ -371,9 +436,15 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    int nborj_start = nbor_j;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -395,14 +466,16 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
       sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
       sw_cut_ij=sw3_ijparam.x;
 
-      int nbor_k=nbor_j-offset_j+offset_k;
-      if (nbor_k<=nbor_j)
-        nbor_k+=n_stride;
+      int nbor_k=nborj_start-offset_j+offset_k;
+      int numk = dev_short_nbor[nbor_k-n_stride];
+      int k_end = nbor_k+fast_mul(numk,n_stride);
 
-      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
+        if (k <= j) continue;
+
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
         ktype=map[ktype];
@@ -460,6 +533,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
                              const __global int * dev_acc,
+                             const __global int * dev_short_nbor,
                              __global acctyp4 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag,
@@ -494,8 +568,13 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -534,8 +613,13 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
         nbor_k+=offset_k;
       }
 
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
+
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -598,6 +682,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
                              const __global int * dev_acc,
+                             const __global int * dev_short_nbor,
                              __global acctyp4 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag,
@@ -632,8 +717,13 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -672,8 +762,13 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
         nbor_k+=offset_k;
       }
 
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
+
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp
index 96537e65d3..55318bb3b0 100644
--- a/lib/gpu/lal_vashishta.cpp
+++ b/lib/gpu/lal_vashishta.cpp
@@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
   int success;
   success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                            _screen,vashishta,"k_vashishta","k_vashishta_three_center",
-                           "k_vashishta_three_end");
+                           "k_vashishta_three_end","k_vashishta_short_nbor");
   if (success!=0)
     return success;
 
@@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
 
   param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
 
+  double r0sqmax = 0;
   for (int i=0; i<nparams; i++) {
-    double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4?
-
+    double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4?
+    if (r0sqmax < r0sq) r0sqmax = r0sq;
     dview[i].x=static_cast<numtyp>(r0sq);
     dview[i].y=static_cast<numtyp>(gamma[i]);
     dview[i].z=static_cast<numtyp>(cutsq[i]);
     dview[i].w=static_cast<numtyp>(r0[i]);
   }
 
+  _cutshortsq = static_cast<numtyp>(r0sqmax);
+
   ucl_copy(param4,dview,false);
   param4_tex.get_texture(*(this->pair_program),"param4_tex");
   param4_tex.bind_float(param4,4);
@@ -223,15 +226,27 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   else
     vflag=0;
 
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+  // build the short neighbor list
+  int ainum=this->_ainum;
+  int nbor_pitch=this->nbor->nbor_pitch();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
                                (BX/this->_threads_per_atom)));
 
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
+                 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                 &this->dev_short_nbor, &ainum,
+                 &nbor_pitch, &this->_threads_per_atom);
+
   // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
   // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
-  int ainum=this->ans->inum();
-  int nbor_pitch=this->nbor->nbor_pitch();
+  ainum=this->ans->inum();
+  nbor_pitch=this->nbor->nbor_pitch();
+  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
   this->time_pair.start();
 
+  // note that k_pair does not run with the short neighbor list
   this->k_pair.set_size(GX,BX);
   this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
                    &map, &elem2param, &_nelements,
@@ -248,6 +263,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
                            &map, &elem2param, &_nelements,
                            &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                           &this->dev_short_nbor,
                            &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
                            &nbor_pitch, &this->_threads_per_atom, &evatom);
   Answer<numtyp,acctyp> *end_ans;
@@ -257,21 +273,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   end_ans=this->ans;
   #endif
   if (evatom!=0) {
-    
     this->k_three_end_vatom.set_size(GX,BX);
     this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
                           &map, &elem2param, &_nelements,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
   } else {
-    
     this->k_three_end.set_size(GX,BX);
     this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
                           &map, &elem2param, &_nelements,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
   }
diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu
index caa3c03613..7449b18f6b 100644
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@@ -136,6 +136,56 @@ texture<int4> param5_tex;
 
 #endif
 
+__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
+                                     const numtyp cutshortsq,
+                                     const __global int * dev_nbor,
+                                     const __global int * dev_packed,
+                                     __global int * dev_short_nbor,
+                                     const int inum, const int nbor_pitch,
+                                     const int t_per_atom) {
+  __local int n_stride;
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+
+    int ncount = 0;
+    int m = nbor;
+    dev_short_nbor[m] = 0;
+    int nbor_short = nbor+n_stride;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      int nj = j;
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutshortsq) {
+        dev_short_nbor[nbor_short] = nj;
+        nbor_short += n_stride;
+        ncount++;
+      }
+    } // for nbor
+
+    // store the number of neighbors for each thread
+    dev_short_nbor[m] = ncount;
+
+  } // if ii
+}
 
 __kernel void k_vashishta(const __global numtyp4 *restrict x_,
                    const __global numtyp4 *restrict param1,
@@ -176,7 +226,6 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
-
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
@@ -211,7 +260,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
         numtyp param3_dvrc=param3_ijparam.z;
         numtyp param3_c0  =param3_ijparam.w;
 
-        numtyp r=sqrt(rsq);
+        numtyp r=ucl_sqrt(rsq);
         numtyp rinvsq=1.0/rsq;
         numtyp r4inv = rinvsq*rinvsq;
         numtyp r6inv = rinvsq*r4inv;
@@ -219,8 +268,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
         numtyp reta = pow(r,-param1_eta);
         numtyp lam1r = r*param1_lam1inv;
         numtyp lam4r = r*param1_lam4inv;
-        numtyp vc2 = param1_zizj * exp(-lam1r)/r;
-        numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r);
+        numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
+        numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r);
 
         numtyp force = (param2_dvrc*r
             - (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv
@@ -255,31 +304,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
   numtyp r1 = ucl_sqrt(rsq1);                                                \
   numtyp rinvsq1 = ucl_recip(rsq1);                                          \
   numtyp rainv1 = ucl_recip(r1 - param_r0_ij);                               \
-  numtyp gsrainv1 = param_gamma_ij * rainv1;                                    \
+  numtyp gsrainv1 = param_gamma_ij * rainv1;                                 \
   numtyp gsrainvsq1 = gsrainv1*rainv1/r1;                                    \
   numtyp expgsrainv1 = ucl_exp(gsrainv1);                                    \
                                                                              \
   numtyp r2 = ucl_sqrt(rsq2);                                                \
   numtyp rinvsq2 = ucl_recip(rsq2);                                          \
   numtyp rainv2 = ucl_recip(r2 - param_r0_ik);                               \
-  numtyp gsrainv2 = param_gamma_ik * rainv2;                                    \
+  numtyp gsrainv2 = param_gamma_ik * rainv2;                                 \
   numtyp gsrainvsq2 = gsrainv2*rainv2/r2;                                    \
   numtyp expgsrainv2 = ucl_exp(gsrainv2);                                    \
                                                                              \
   numtyp rinv12 = ucl_recip(r1*r2);                                          \
   numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12;      \
-  numtyp delcs = cs - param_costheta_ijk;                                       \
+  numtyp delcs = cs - param_costheta_ijk;                                    \
   numtyp delcssq = delcs*delcs;                                              \
-  numtyp pcsinv = param_bigc_ijk*delcssq+1.0;                                   \
+  numtyp pcsinv = param_bigc_ijk*delcssq+1.0;                                \
   numtyp pcsinvsq = pcsinv*pcsinv;                                           \
   numtyp pcs = delcssq/pcsinv;                                               \
                                                                              \
   numtyp facexp = expgsrainv1*expgsrainv2;                                   \
                                                                              \
-  numtyp facrad = param_bigb_ijk * facexp*pcs;                                  \
+  numtyp facrad = param_bigb_ijk * facexp*pcs;                               \
   numtyp frad1 = facrad*gsrainvsq1;                                          \
   numtyp frad2 = facrad*gsrainvsq2;                                          \
-  numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq;                      \
+  numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq;                   \
   numtyp facang12 = rinv12*facang;                                           \
   numtyp csfacang = cs*facang;                                               \
   numtyp csfac1 = rinvsq1*csfacang;                                          \
@@ -311,28 +360,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
   numtyp r1 = ucl_sqrt(rsq1);                                                \
   numtyp rinvsq1 = ucl_recip(rsq1);                                          \
   numtyp rainv1 = ucl_recip(r1 - param_r0_ij);                               \
-  numtyp gsrainv1 = param_gamma_ij * rainv1;                                    \
+  numtyp gsrainv1 = param_gamma_ij * rainv1;                                 \
   numtyp gsrainvsq1 = gsrainv1*rainv1/r1;                                    \
   numtyp expgsrainv1 = ucl_exp(gsrainv1);                                    \
                                                                              \
   numtyp r2 = ucl_sqrt(rsq2);                                                \
   numtyp rainv2 = ucl_recip(r2 - param_r0_ik);                               \
-  numtyp gsrainv2 = param_gamma_ik * rainv2;                                    \
+  numtyp gsrainv2 = param_gamma_ik * rainv2;                                 \
   numtyp expgsrainv2 = ucl_exp(gsrainv2);                                    \
                                                                              \
   numtyp rinv12 = ucl_recip(r1*r2);                                          \
   numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12;      \
-  numtyp delcs = cs - param_costheta_ijk;                                       \
+  numtyp delcs = cs - param_costheta_ijk;                                    \
   numtyp delcssq = delcs*delcs;                                              \
-  numtyp pcsinv = param_bigc_ijk*delcssq+1.0;                                   \
+  numtyp pcsinv = param_bigc_ijk*delcssq+1.0;                                \
   numtyp pcsinvsq = pcsinv*pcsinv;                                           \
   numtyp pcs = delcssq/pcsinv;                                               \
                                                                              \
   numtyp facexp = expgsrainv1*expgsrainv2;                                   \
                                                                              \
-  numtyp facrad = param_bigb_ijk * facexp*pcs;                                  \
+  numtyp facrad = param_bigb_ijk * facexp*pcs;                               \
   numtyp frad1 = facrad*gsrainvsq1;                                          \
-  numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq;                      \
+  numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq;                   \
   numtyp facang12 = rinv12*facang;                                           \
   numtyp csfacang = cs*facang;                                               \
   numtyp csfac1 = rinvsq1*csfacang;                                          \
@@ -353,6 +402,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
                                 const int nelements,
                                 const __global int * dev_nbor,
                                 const __global int * dev_packed,
+                                const __global int * dev_short_nbor,
                                 __global acctyp4 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
@@ -387,9 +437,14 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
-    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    int nborj_start = nbor_j;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
-      int j=dev_packed[nbor_j];
+    for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -410,14 +465,16 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
       param_gamma_ij=param4_ijparam.y;
       param_r0_ij=param4_ijparam.w;
       
-      int nbor_k=nbor_j-offset_j+offset_k;
-      if (nbor_k<=nbor_j)
-        nbor_k+=n_stride;
+      int nbor_k=nborj_start-offset_j+offset_k;
+      int numk = dev_short_nbor[nbor_k-n_stride];
+      int k_end = nbor_k+fast_mul(numk,n_stride);
 
-      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
+        if (k <= j) continue;
+
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
         ktype=map[ktype];
@@ -478,6 +535,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
                              const __global int * dev_acc,
+                             const __global int * dev_short_nbor,
                              __global acctyp4 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag,
@@ -512,8 +570,13 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -551,8 +614,13 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
         nbor_k+=offset_k;
       }
 
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
+
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -617,6 +685,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
                              const __global int * dev_acc,
+                             const __global int * dev_short_nbor,
                              __global acctyp4 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag,
@@ -651,8 +720,13 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -690,8 +764,13 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
         nbor_k+=offset_k;
       }
 
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
+
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
diff --git a/lib/gpu/lal_vashishta.h b/lib/gpu/lal_vashishta.h
index 6eea8362cc..c5c96a5b80 100644
--- a/lib/gpu/lal_vashishta.h
+++ b/lib/gpu/lal_vashishta.h
@@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> {
   UCL_D_Vec<int> elem2param;
   UCL_D_Vec<int> map;
   int _nparams,_nelements;
+  numtyp _cutshortsq;
 
   UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;
 

From 1c6533e53df2e4d69cb40228baf69677b5941405 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Sat, 8 Jul 2017 14:15:26 -0500
Subject: [PATCH 02/11] Working on short neighbor list for tersoff/gpu

---
 lib/gpu/lal_tersoff.cpp |  81 +++++++++++---------
 lib/gpu/lal_tersoff.cu  | 165 ++++++++++++++++++++++++++++++++--------
 lib/gpu/lal_tersoff.h   |   3 +-
 3 files changed, 178 insertions(+), 71 deletions(-)

diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp
index 6b0b563d9f..f1e0320b8c 100644
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
   int success;
   success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                            _screen,tersoff,"k_tersoff_repulsive",
-                           "k_tersoff_three_center", "k_tersoff_three_end");
+                           "k_tersoff_three_center", "k_tersoff_three_end",
+                           "k_tersoff_short_nbor");
   if (success!=0)
     return success;
 
@@ -157,8 +158,12 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
 
   UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
                                UCL_WRITE_ONLY);
-  for (int i=0; i<nparams; i++)
+  double cutsqmax = 0.0;
+  for (int i=0; i<nparams; i++) {
     cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
+    if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
+  }
+  _cutshortsq = static_cast<numtyp>(cutsqmax);
   cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(cutsq,cutsq_view,false);
 
@@ -250,7 +255,7 @@ void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
     this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
     if (!success)
       return;
-    _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
+    this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
   }
 
   this->atom->cast_x_data(host_x,host_type);
@@ -258,29 +263,19 @@ void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
   this->atom->add_x_data(host_x,host_type);
 
   // re-allocate zetaij if necessary
-  if (nall*_max_nbors > _zetaij.cols()) {
+  if (nall*this->_max_nbors > _zetaij.cols()) {
     int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(_max_nbors*_nmax);
+    _zetaij.resize(this->_max_nbors*_nmax);
   }
 
+  this->_ainum=nlist;
+
   int _eflag;
   if (eflag)
     _eflag=1;
   else
     _eflag=0;
 
-  int ainum=nlist;
-  int nbor_pitch=this->nbor->nbor_pitch();
-  int BX=this->block_pair();
-  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
-                               (BX/(JTHREADS*KTHREADS))));
-
-  this->k_zeta.set_size(GX,BX);
-  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
-                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
-                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
-
   int evatom=0;
   if (eatom || vatom)
     evatom=1;
@@ -329,7 +324,7 @@ int ** TersoffT::compute(const int ago, const int inum_full,
 
   // Build neighbor list on GPU if necessary
   if (ago==0) {
-    _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+    this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                     sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
@@ -343,29 +338,19 @@ int ** TersoffT::compute(const int ago, const int inum_full,
   *jnum=this->nbor->host_acc.begin();
 
   // re-allocate zetaij if necessary
-  if (nall*_max_nbors > _zetaij.cols()) {
+  if (nall*this->_max_nbors > _zetaij.cols()) {
     int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(_max_nbors*_nmax);
+    _zetaij.resize(this->_max_nbors*_nmax);
   }
 
+  this->_ainum=nall;
+
   int _eflag;
   if (eflag)
     _eflag=1;
   else
     _eflag=0;
 
-  int ainum=nall;
-  int nbor_pitch=this->nbor->nbor_pitch();
-  int BX=this->block_pair();
-  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
-                               (BX/(JTHREADS*KTHREADS))));
-
-  this->k_zeta.set_size(GX,BX);
-  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
-                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
-                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
-
   int evatom=0;
   if (eatom || vatom)
     evatom=1;
@@ -402,9 +387,32 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   else
     vflag=0;
 
-  int ainum=this->ans->inum();
+  // build the short neighbor list
+  int ainum=this->_ainum;
   int nbor_pitch=this->nbor->nbor_pitch();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
+                               (BX/this->_threads_per_atom)));
+
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
+                 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                 &this->dev_short_nbor, &ainum,
+                 &nbor_pitch, &this->_threads_per_atom);
+
+  nbor_pitch=this->nbor->nbor_pitch();
+  GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
+                               (BX/(JTHREADS*KTHREADS))));
+
+  this->k_zeta.set_size(GX,BX);
+  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
+                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
+                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                   &this->dev_short_nbor,
+                   &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
+
+  ainum=this->ans->inum();
+  nbor_pitch=this->nbor->nbor_pitch();
+  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   this->time_pair.start();
@@ -423,6 +431,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                            &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                            &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                           &this->dev_short_nbor,
                            &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
                            &nbor_pitch, &this->_threads_per_atom, &evatom);
 
@@ -437,7 +446,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
     this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                           &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
 
@@ -446,7 +455,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
     this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                           &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
   }
diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu
index b7d48d9e34..d132545984 100644
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@@ -164,6 +164,57 @@ texture<int4> ts5_tex;
 
 #endif
 
+__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
+                                   const numtyp cutshortsq,
+                                   const __global int * dev_nbor,
+                                   const __global int * dev_packed,
+                                   __global int * dev_short_nbor,
+                                   const int inum, const int nbor_pitch,
+                                   const int t_per_atom) {
+  __local int n_stride;
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+
+    int ncount = 0;
+    int m = nbor;
+    dev_short_nbor[m] = 0;
+    int nbor_short = nbor+n_stride;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      int nj = j;
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutshortsq) {
+        dev_short_nbor[nbor_short] = nj;
+        nbor_short += n_stride;
+        ncount++;
+      }
+    } // for nbor
+
+    // store the number of neighbors for each thread
+    dev_short_nbor[m] = ncount;
+
+  } // if ii
+}
+
 // Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
 // while the block size should never be less than 32.
 // SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@@ -184,6 +235,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
                              __global acctyp4 * zetaij,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
+                             const __global int * dev_short_nbor,
                              const int eflag, const int inum,
                              const int nbor_pitch, const int t_per_atom) {
   __local int tpa_sq,n_stride;
@@ -218,15 +270,20 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
-    int nborj_start = nbor_j;
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    int nborj_start = nbor_j;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -247,8 +304,11 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
       z = (acctyp)0;
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+      int numk = dev_short_nbor[nbor_k-n_stride];
+      int k_end = nbor_k+fast_mul(numk,n_stride);
+
+      for ( ; nbor_k < k_end; nbor_k+=n_stride) {
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == j) continue;
@@ -284,9 +344,10 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
 
       //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               i, nbor_j, offset_j, idx);
+      //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+      int idx = nbor_j - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               i, nbor_j, offset_j, idx);
       store_zeta(z, tid, t_per_atom, offset_k);
 
       numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
@@ -428,6 +489,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
                                      const __global acctyp4 *restrict zetaij,
                                      const __global int * dev_nbor,
                                      const __global int * dev_packed,
+                                     const __global int * dev_short_nbor,
                                      __global acctyp4 *restrict ans,
                                      __global acctyp *restrict engv,
                                      const int eflag, const int vflag,
@@ -466,15 +528,20 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
-    int nborj_start = nbor_j;
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    int nborj_start = nbor_j;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -497,9 +564,10 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
 
       //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               i, nbor_j, offset_j, idx);
+      //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+      int idx = nbor_j - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               i, nbor_j, offset_j, idx);
       acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
       numtyp force = zeta_ij.x*tpainv;
       numtyp prefactor = zeta_ij.y;
@@ -520,9 +588,12 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
         virial[5] += delr1[1]*delr1[2]*mforce;
       }
 
-      int nbor_k=nborj_start-offset_j+offset_k;
-      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+      int nbor_k = nborj_start-offset_j+offset_k;
+      int numk = dev_short_nbor[nbor_k-n_stride];
+      int k_end = nbor_k+fast_mul(numk,n_stride);
+
+      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (j == k) continue;
@@ -598,6 +669,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   const __global int * dev_acc,
+                                  const __global int * dev_short_nbor, 
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
@@ -643,9 +715,15 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -683,13 +761,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }
+
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
       int nbork_start = nbor_k;
 
       // look up for zeta_ji: find i in the j's neighbor list
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -711,9 +794,10 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               j, ijnum, offset_kf, idx);
+      //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+      int idx = ijnum - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
       numtyp force = zeta_ji.x*tpainv;
       numtyp prefactor_ji = zeta_ji.y;
@@ -736,7 +820,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -777,9 +861,10 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
-        int idx;
-        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-                 j, nbor_k, offset_k, idx);
+        //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+        int idx = nbor_k - n_stride;
+//        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
         numtyp prefactor_jk = zeta_jk.y;
         int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -824,6 +909,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
                                         const __global int * dev_nbor,
                                         const __global int * dev_packed,
                                         const __global int * dev_acc,
+                                        const __global int * dev_short_nbor,
                                         __global acctyp4 *restrict ans,
                                         __global acctyp *restrict engv,
                                         const int eflag, const int vflag,
@@ -869,9 +955,15 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -909,13 +1001,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }
+
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
       int nbork_start = nbor_k;
 
       // look up for zeta_ji
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -937,9 +1034,10 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               j, ijnum, offset_kf, idx);
+      //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+      int idx = ijnum - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; //  fetch(zeta_ji,idx,zeta_tex);
       numtyp force = zeta_ji.x*tpainv;
       numtyp prefactor_ji = zeta_ji.y;
@@ -962,7 +1060,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -1010,9 +1108,10 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
-        int idx;
-        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-                 j, nbor_k, offset_k, idx);
+        //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+        int idx = nbor_k - n_stride;
+//        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
         numtyp prefactor_jk = zeta_jk.y;
 
diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h
index c72ebd7286..5e2bedfeef 100644
--- a/lib/gpu/lal_tersoff.h
+++ b/lib/gpu/lal_tersoff.h
@@ -104,8 +104,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
 
   UCL_Kernel k_zeta;
   UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
-
-  int _max_nbors;
+  numtyp _cutshortsq;
 
  private:
   bool _allocated;

From 77c60189b8cd11d6f1f5329c6e22ff4d105b7c8a Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Sat, 8 Jul 2017 14:43:53 -0500
Subject: [PATCH 03/11] Minor cleanups for tersoff/gpu

---
 lib/gpu/lal_tersoff.cpp | 3 ++-
 lib/gpu/lal_tersoff.cu  | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp
index f1e0320b8c..d830499257 100644
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@@ -163,10 +163,11 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
     cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
     if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
   }
-  _cutshortsq = static_cast<numtyp>(cutsqmax);
   cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(cutsq,cutsq_view,false);
 
+  _cutshortsq = static_cast<numtyp>(cutsqmax);
+
   UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
                            *(this->ucl_device), UCL_WRITE_ONLY);
 
diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu
index d132545984..9faa59c34d 100644
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@@ -106,7 +106,7 @@ texture<int4> ts5_tex;
     ans[ii]=old;                                                            \
   }
 
-#define store_zeta(z, tid, t_per_atom, offset)                              \
+#define acc_zeta(z, tid, t_per_atom, offset)                                \
   if (t_per_atom>1) {                                                       \
     __local acctyp red_acc[BLOCK_PAIR];                                     \
     red_acc[tid]=z;                                                         \
@@ -155,7 +155,7 @@ texture<int4> ts5_tex;
     ans[ii]=old;                                                            \
   }
 
-#define store_zeta(z, tid, t_per_atom, offset)                              \
+#define acc_zeta(z, tid, t_per_atom, offset)                                \
   if (t_per_atom>1) {                                                       \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       z += shfl_xor(z, s, t_per_atom);                                      \
@@ -348,7 +348,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
       int idx = nbor_j - n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
-      store_zeta(z, tid, t_per_atom, offset_k);
+      acc_zeta(z, tid, t_per_atom, offset_k);
 
       numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
       numtyp ijparam_lam2 = ts1_ijparam.y;

From 34fe2273f64bbee8f96ab91642106471ad77c25b Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Sat, 8 Jul 2017 14:59:48 -0500
Subject: [PATCH 04/11] Added short neighbor list implementation for
 tersoff/zbl/gpu and tersoff/mod/gpu

---
 lib/gpu/lal_tersoff_mod.cpp |  58 +++++++++---
 lib/gpu/lal_tersoff_mod.cu  | 173 ++++++++++++++++++++++++++++--------
 lib/gpu/lal_tersoff_mod.h   |   3 +-
 lib/gpu/lal_tersoff_zbl.cpp |  58 +++++++++---
 lib/gpu/lal_tersoff_zbl.cu  | 171 +++++++++++++++++++++++++++--------
 lib/gpu/lal_tersoff_zbl.h   |   2 +-
 6 files changed, 365 insertions(+), 100 deletions(-)

diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp
index 553dad3583..ba1804c37e 100644
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
   int success;
   success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                            _screen,tersoff_mod,"k_tersoff_mod_repulsive",
-                           "k_tersoff_mod_three_center", "k_tersoff_mod_three_end");
+                           "k_tersoff_mod_three_center", "k_tersoff_mod_three_end",
+                           "k_tersoff_mod_short_nbor");
   if (success!=0)
     return success;
 
@@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
 
   UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
                                UCL_WRITE_ONLY);
-  for (int i=0; i<nparams; i++)
+  double cutsqmax = 0.0;
+  for (int i=0; i<nparams; i++) {
     cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
+    if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
+  }
   cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(cutsq,cutsq_view,false);
 
+  _cutshortsq = static_cast<numtyp>(cutsqmax);
+
   UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
                            *(this->ucl_device), UCL_WRITE_ONLY);
 
@@ -250,7 +256,7 @@ void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
     this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
     if (!success)
       return;
-    _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
+    this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
   }
 
   this->atom->cast_x_data(host_x,host_type);
@@ -258,11 +264,13 @@ void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
   this->atom->add_x_data(host_x,host_type);
 
   // re-allocate zetaij if necessary
-  if (nall*_max_nbors > _zetaij.cols()) {
+  if (nall*this->_max_nbors > _zetaij.cols()) {
     int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(_max_nbors*_nmax);
+    _zetaij.resize(this->_max_nbors*_nmax);
   }
 
+  this->_ainum=nlist;
+
   int _eflag;
   if (eflag)
     _eflag=1;
@@ -329,7 +337,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full,
 
   // Build neighbor list on GPU if necessary
   if (ago==0) {
-    _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+    this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                     sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
@@ -343,11 +351,13 @@ int ** TersoffMT::compute(const int ago, const int inum_full,
   *jnum=this->nbor->host_acc.begin();
 
   // re-allocate zetaij if necessary
-  if (nall*_max_nbors > _zetaij.cols()) {
+  if (nall*this->_max_nbors > _zetaij.cols()) {
     int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(_max_nbors*_nmax);
+    _zetaij.resize(this->_max_nbors*_nmax);
   }
 
+  this->_ainum=nall;
+
   int _eflag;
   if (eflag)
     _eflag=1;
@@ -402,9 +412,32 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   else
     vflag=0;
 
-  int ainum=this->ans->inum();
+  // build the short neighbor list
+  int ainum=this->_ainum;
   int nbor_pitch=this->nbor->nbor_pitch();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
+                               (BX/this->_threads_per_atom)));
+
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
+                 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                 &this->dev_short_nbor, &ainum,
+                 &nbor_pitch, &this->_threads_per_atom);
+
+  nbor_pitch=this->nbor->nbor_pitch();
+  GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
+                               (BX/(JTHREADS*KTHREADS))));
+
+  this->k_zeta.set_size(GX,BX);
+  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
+                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
+                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                   &this->dev_short_nbor,
+                   &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
+
+  ainum=this->ans->inum();
+  nbor_pitch=this->nbor->nbor_pitch();
+  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   this->time_pair.start();
@@ -423,6 +456,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                            &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                            &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                           &this->dev_short_nbor,
                            &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
                            &nbor_pitch, &this->_threads_per_atom, &evatom);
 
@@ -437,7 +471,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
     this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                           &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
 
@@ -446,7 +480,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
     this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                           &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
   }
diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu
index 3a81b36941..75bacc2179 100644
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@@ -106,7 +106,7 @@ texture<int4> ts5_tex;
     ans[ii]=old;                                                            \
   }
 
-#define store_zeta(z, tid, t_per_atom, offset)                              \
+#define acc_zeta(z, tid, t_per_atom, offset)                                \
   if (t_per_atom>1) {                                                       \
     __local acctyp red_acc[BLOCK_PAIR];                                     \
     red_acc[tid]=z;                                                         \
@@ -155,7 +155,7 @@ texture<int4> ts5_tex;
     ans[ii]=old;                                                            \
   }
 
-#define store_zeta(z, tid, t_per_atom, offset)                              \
+#define acc_zeta(z, tid, t_per_atom, offset)                                \
   if (t_per_atom>1) {                                                       \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       z += shfl_xor(z, s, t_per_atom);                                      \
@@ -164,6 +164,57 @@ texture<int4> ts5_tex;
 
 #endif
 
+__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
+                                   const numtyp cutshortsq,
+                                   const __global int * dev_nbor,
+                                   const __global int * dev_packed,
+                                   __global int * dev_short_nbor,
+                                   const int inum, const int nbor_pitch,
+                                   const int t_per_atom) {
+  __local int n_stride;
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+
+    int ncount = 0;
+    int m = nbor;
+    dev_short_nbor[m] = 0;
+    int nbor_short = nbor+n_stride;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      int nj = j;
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutshortsq) {
+        dev_short_nbor[nbor_short] = nj;
+        nbor_short += n_stride;
+        ncount++;
+      }
+    } // for nbor
+
+    // store the number of neighbors for each thread
+    dev_short_nbor[m] = ncount;
+
+  } // if ii
+}
+
 // Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
 // while the block size should never be less than 32.
 // SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@@ -184,6 +235,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
                              __global acctyp4 * zetaij,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
+                             const __global int * dev_short_nbor,
                              const int eflag, const int inum,
                              const int nbor_pitch, const int t_per_atom) {
   __local int tpa_sq,n_stride;
@@ -218,15 +270,20 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
-    int nborj_start = nbor_j;
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    int nborj_start = nbor_j;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -244,11 +301,14 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
       if (rsq1 > cutsq[ijparam]) continue;
 
       // compute zeta_ij
-      z = (numtyp)0;
+      z = (acctyp)0;
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+      int numk = dev_short_nbor[nbor_k-n_stride];
+      int k_end = nbor_k+fast_mul(numk,n_stride);
+
+      for ( ; nbor_k < k_end; nbor_k+=n_stride) {
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == j) continue;
@@ -287,10 +347,11 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
 
       //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               i, nbor_j, offset_j, idx);
-      store_zeta(z, tid, t_per_atom, offset_k);
+      //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+      int idx = nbor_j - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               i, nbor_j, offset_j, idx);
+      acc_zeta(z, tid, t_per_atom, offset_k);
 
       numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
       numtyp ijparam_lam2 = ts1_ijparam.y;
@@ -430,6 +491,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
                                      const __global acctyp4 *restrict zetaij,
                                      const __global int * dev_nbor,
                                      const __global int * dev_packed,
+                                     const __global int * dev_short_nbor,
                                      __global acctyp4 *restrict ans,
                                      __global acctyp *restrict engv,
                                      const int eflag, const int vflag,
@@ -470,15 +532,20 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
-    int nborj_start = nbor_j;
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    int nborj_start = nbor_j;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -501,9 +568,10 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
 
       //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               i, nbor_j, offset_j, idx);
+      //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+      int idx = nbor_j - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               i, nbor_j, offset_j, idx);
       acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
       numtyp force = zeta_ij.x*tpainv;
       numtyp prefactor = zeta_ij.y;
@@ -524,9 +592,12 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
         virial[5] += delr1[1]*delr1[2]*mforce;
       }
 
-      int nbor_k=nborj_start-offset_j+offset_k;
-      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+      int nbor_k = nborj_start-offset_j+offset_k;
+      int numk = dev_short_nbor[nbor_k-n_stride];
+      int k_end = nbor_k+fast_mul(numk,n_stride);
+
+      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (j == k) continue;
@@ -606,6 +677,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   const __global int * dev_acc,
+                                  const __global int * dev_short_nbor, 
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
@@ -653,9 +725,15 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -693,13 +771,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }
+
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
       int nbork_start = nbor_k;
 
       // look up for zeta_ji: find i in the j's neighbor list
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -721,9 +804,10 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               j, ijnum, offset_kf, idx);
+      //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+      int idx = ijnum - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
       numtyp force = zeta_ji.x*tpainv;
       numtyp prefactor_ji = zeta_ji.y;
@@ -746,7 +830,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -790,9 +874,10 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
-        int idx;
-        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-                 j, nbor_k, offset_k, idx);
+        //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+        int idx = nbor_k - n_stride;
+//        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
         numtyp prefactor_jk = zeta_jk.y;
         int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -841,6 +926,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
                                         const __global int * dev_nbor,
                                         const __global int * dev_packed,
                                         const __global int * dev_acc,
+                                        const __global int * dev_short_nbor,
                                         __global acctyp4 *restrict ans,
                                         __global acctyp *restrict engv,
                                         const int eflag, const int vflag,
@@ -888,9 +974,15 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -928,13 +1020,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }
+
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
       int nbork_start = nbor_k;
 
       // look up for zeta_ji
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -956,9 +1053,10 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               j, ijnum, offset_kf, idx);
+      //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+      int idx = ijnum - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; //  fetch(zeta_ji,idx,zeta_tex);
       numtyp force = zeta_ji.x*tpainv;
       numtyp prefactor_ji = zeta_ji.y;
@@ -981,7 +1079,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -1032,9 +1130,10 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
-        int idx;
-        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-                 j, nbor_k, offset_k, idx);
+        //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+        int idx = nbor_k - n_stride;
+//        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
         numtyp prefactor_jk = zeta_jk.y;
 
diff --git a/lib/gpu/lal_tersoff_mod.h b/lib/gpu/lal_tersoff_mod.h
index 9a05c66009..286a23fa6f 100644
--- a/lib/gpu/lal_tersoff_mod.h
+++ b/lib/gpu/lal_tersoff_mod.h
@@ -104,8 +104,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
 
   UCL_Kernel k_zeta;
   UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
-
-  int _max_nbors;
+  numtyp _cutshortsq;
 
  private:
   bool _allocated;
diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp
index 9cce8a802d..6efa8b9487 100644
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
   int success;
   success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
                            _screen,tersoff_zbl,"k_tersoff_zbl_repulsive",
-                           "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end");
+                           "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end",
+                           "k_tersoff_zbl_short_nbor");
   if (success!=0)
     return success;
 
@@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
 
   UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
                                UCL_WRITE_ONLY);
-  for (int i=0; i<nparams; i++)
+  double cutsqmax = 0.0;
+  for (int i=0; i<nparams; i++) {
     cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
+    if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
+  }
   cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
   ucl_copy(cutsq,cutsq_view,false);
 
+  _cutshortsq = static_cast<numtyp>(cutsqmax);
+
   UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
                            *(this->ucl_device), UCL_WRITE_ONLY);
 
@@ -275,7 +281,7 @@ void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
     this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
     if (!success)
       return;
-    _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
+    this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
   }
 
   this->atom->cast_x_data(host_x,host_type);
@@ -283,11 +289,13 @@ void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
   this->atom->add_x_data(host_x,host_type);
 
   // re-allocate zetaij if necessary
-  if (nall*_max_nbors > _zetaij.cols()) {
+  if (nall*this->_max_nbors > _zetaij.cols()) {
     int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(_max_nbors*_nmax);
+    _zetaij.resize(this->_max_nbors*_nmax);
   }
 
+  this->_ainum=nlist;
+
   int _eflag;
   if (eflag)
     _eflag=1;
@@ -354,7 +362,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full,
 
   // Build neighbor list on GPU if necessary
   if (ago==0) {
-    _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+    this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
                     sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return NULL;
@@ -368,11 +376,13 @@ int ** TersoffZT::compute(const int ago, const int inum_full,
   *jnum=this->nbor->host_acc.begin();
 
   // re-allocate zetaij if necessary
-  if (nall*_max_nbors > _zetaij.cols()) {
+  if (nall*this->_max_nbors > _zetaij.cols()) {
     int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(_max_nbors*_nmax);
+    _zetaij.resize(this->_max_nbors*_nmax);
   }
 
+  this->_ainum=nall;
+
   int _eflag;
   if (eflag)
     _eflag=1;
@@ -427,9 +437,32 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   else
     vflag=0;
 
-  int ainum=this->ans->inum();
+  // build the short neighbor list
+  int ainum=this->_ainum;
   int nbor_pitch=this->nbor->nbor_pitch();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
+                               (BX/this->_threads_per_atom)));
+
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
+                 &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                 &this->dev_short_nbor, &ainum,
+                 &nbor_pitch, &this->_threads_per_atom);
+
+  nbor_pitch=this->nbor->nbor_pitch();
+  GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
+                               (BX/(JTHREADS*KTHREADS))));
+
+  this->k_zeta.set_size(GX,BX);
+  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
+                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
+                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                   &this->dev_short_nbor,
+                   &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
+
+  ainum=this->ans->inum();
+  nbor_pitch=this->nbor->nbor_pitch();
+  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
 
   this->time_pair.start();
@@ -449,6 +482,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                            &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                            &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                           &this->dev_short_nbor,
                            &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
                            &nbor_pitch, &this->_threads_per_atom, &evatom);
 
@@ -463,7 +497,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
     this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                           &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
 
@@ -472,7 +506,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
     this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                           &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc,
+                          &this->nbor->dev_acc, &this->dev_short_nbor,
                           &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                           &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
   }
diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu
index 9509b9802c..439d4028df 100644
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@@ -109,7 +109,7 @@ texture<int4> ts6_tex;
     ans[ii]=old;                                                            \
   }
 
-#define store_zeta(z, tid, t_per_atom, offset)                              \
+#define acc_zeta(z, tid, t_per_atom, offset)                                \
   if (t_per_atom>1) {                                                       \
     __local acctyp red_acc[BLOCK_PAIR];                                     \
     red_acc[tid]=z;                                                         \
@@ -158,7 +158,7 @@ texture<int4> ts6_tex;
     ans[ii]=old;                                                            \
   }
 
-#define store_zeta(z, tid, t_per_atom, offset)                              \
+#define acc_zeta(z, tid, t_per_atom, offset)                                \
   if (t_per_atom>1) {                                                       \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       z += shfl_xor(z, s, t_per_atom);                                      \
@@ -167,6 +167,57 @@ texture<int4> ts6_tex;
 
 #endif
 
+__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
+                                   const numtyp cutshortsq,
+                                   const __global int * dev_nbor,
+                                   const __global int * dev_packed,
+                                   __global int * dev_short_nbor,
+                                   const int inum, const int nbor_pitch,
+                                   const int t_per_atom) {
+  __local int n_stride;
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+
+    int ncount = 0;
+    int m = nbor;
+    dev_short_nbor[m] = 0;
+    int nbor_short = nbor+n_stride;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int j=dev_packed[nbor];
+      int nj = j;
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutshortsq) {
+        dev_short_nbor[nbor_short] = nj;
+        nbor_short += n_stride;
+        ncount++;
+      }
+    } // for nbor
+
+    // store the number of neighbors for each thread
+    dev_short_nbor[m] = ncount;
+
+  } // if ii
+}
+
 // Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
 // while the block size should never be less than 32.
 // SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@@ -188,6 +239,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
                              __global acctyp4 * zetaij,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
+                             const __global int * dev_short_nbor,
                              const int eflag, const int inum,
                              const int nbor_pitch, const int t_per_atom) {
   __local int tpa_sq,n_stride;
@@ -224,15 +276,20 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
-    int nborj_start = nbor_j;
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    int nborj_start = nbor_j;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -253,8 +310,11 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
       z = (acctyp)0;
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+      int numk = dev_short_nbor[nbor_k-n_stride];
+      int k_end = nbor_k+fast_mul(numk,n_stride);
+
+      for ( ; nbor_k < k_end; nbor_k+=n_stride) {
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == j) continue;
@@ -290,10 +350,11 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
 
       //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               i, nbor_j, offset_j, idx);
-      store_zeta(z, tid, t_per_atom, offset_k);
+      //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+      int idx = nbor_j - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               i, nbor_j, offset_j, idx);
+      acc_zeta(z, tid, t_per_atom, offset_k);
 
       numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
       numtyp ijparam_lam2 = ts1_ijparam.y;
@@ -448,6 +509,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
                                      const __global acctyp4 *restrict zetaij,
                                      const __global int * dev_nbor,
                                      const __global int * dev_packed,
+                                     const __global int * dev_short_nbor,
                                      __global acctyp4 *restrict ans,
                                      __global acctyp *restrict engv,
                                      const int eflag, const int vflag,
@@ -486,15 +548,20 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
     int offset_k=tid & (t_per_atom-1);
-    int nborj_start = nbor_j;
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    int nborj_start = nbor_j;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -517,9 +584,10 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
 
       //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               i, nbor_j, offset_j, idx);
+      //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
+      int idx = nbor_j - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               i, nbor_j, offset_j, idx);
       acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
       numtyp force = zeta_ij.x*tpainv;
       numtyp prefactor = zeta_ij.y;
@@ -540,9 +608,12 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
         virial[5] += delr1[1]*delr1[2]*mforce;
       }
 
-      int nbor_k=nborj_start-offset_j+offset_k;
-      for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+      int nbor_k = nborj_start-offset_j+offset_k;
+      int numk = dev_short_nbor[nbor_k-n_stride];
+      int k_end = nbor_k+fast_mul(numk,n_stride);
+
+      for ( ; nbor_k<k_end; nbor_k+=n_stride) {
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (j == k) continue;
@@ -618,6 +689,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   const __global int * dev_acc,
+                                  const __global int * dev_short_nbor, 
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
@@ -663,9 +735,15 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -703,13 +781,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }
+
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
       int nbork_start = nbor_k;
 
       // look up for zeta_ji: find i in the j's neighbor list
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -731,9 +814,10 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               j, ijnum, offset_kf, idx);
+      //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+      int idx = ijnum - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
       numtyp force = zeta_ji.x*tpainv;
       numtyp prefactor_ji = zeta_ji.y;
@@ -756,7 +840,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -797,9 +881,10 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
-        int idx;
-        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-                 j, nbor_k, offset_k, idx);
+        //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+        int idx = nbor_k - n_stride;
+//        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
         numtyp prefactor_jk = zeta_jk.y;
         int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -844,6 +929,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
                                         const __global int * dev_nbor,
                                         const __global int * dev_packed,
                                         const __global int * dev_acc,
+                                        const __global int * dev_short_nbor,
                                         __global acctyp4 *restrict ans,
                                         __global acctyp *restrict engv,
                                         const int eflag, const int vflag,
@@ -889,9 +975,15 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor_j];
+    nbor_j += n_stride;
+    nbor_end = nbor_j+fast_mul(numj,n_stride);
+
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_packed[nbor_j];
+      int j=dev_short_nbor[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -929,13 +1021,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
         k_end=nbor_k+numk;
         nbor_k+=offset_k;
       }
+
+      // recalculate numk and k_end for the use of short neighbor list
+      numk = dev_short_nbor[nbor_k];
+      nbor_k += n_stride;
+      k_end = nbor_k+fast_mul(numk,n_stride);
       int nbork_start = nbor_k;
 
       // look up for zeta_ji
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -957,9 +1054,10 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
-      int idx;
-      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-               j, ijnum, offset_kf, idx);
+      //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
+      int idx = ijnum - n_stride;
+//      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; //  fetch(zeta_ji,idx,zeta_tex);
       numtyp force = zeta_ji.x*tpainv;
       numtyp prefactor_ji = zeta_ji.y;
@@ -982,7 +1080,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_packed[nbor_k];
+        int k=dev_short_nbor[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -1030,9 +1128,10 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
-        int idx;
-        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
-                 j, nbor_k, offset_k, idx);
+        //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
+        int idx = nbor_k - n_stride;
+//        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
+//                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
         numtyp prefactor_jk = zeta_jk.y;
 
diff --git a/lib/gpu/lal_tersoff_zbl.h b/lib/gpu/lal_tersoff_zbl.h
index cc0b848845..a5f1ace754 100644
--- a/lib/gpu/lal_tersoff_zbl.h
+++ b/lib/gpu/lal_tersoff_zbl.h
@@ -109,8 +109,8 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
   UCL_Kernel k_zeta;
   UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex;
 
-  int _max_nbors;
   numtyp _global_e,_global_a_0,_global_epsilon_0;
+  numtyp _cutshortsq;
 
  private:
   bool _allocated;

From ea2b01e83b910d29ef487ff51c88b22d8059d434 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Sat, 8 Jul 2017 20:17:31 -0500
Subject: [PATCH 05/11] Refactored 3-body gpu styles to remove code duplication

---
 lib/gpu/lal_base_three.cpp  |   4 +
 lib/gpu/lal_base_three.h    |   6 +-
 lib/gpu/lal_tersoff.cpp     | 152 ++-----------------------------
 lib/gpu/lal_tersoff.h       |  15 ---
 lib/gpu/lal_tersoff_mod.cpp | 176 ++----------------------------------
 lib/gpu/lal_tersoff_mod.h   |  15 ---
 lib/gpu/lal_tersoff_zbl.cpp | 176 ++----------------------------------
 lib/gpu/lal_tersoff_zbl.h   |  15 ---
 8 files changed, 28 insertions(+), 531 deletions(-)

diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp
index fd9fc7f272..4e8a95937c 100644
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@@ -180,6 +180,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
   if (!success)
     return NULL;
 
+  _nall = nall;
+
   // originally the requirement that nall == nlist was enforced
   // to allow direct indexing neighbors of neighbors after re-arrangement
 //  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
@@ -214,6 +216,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
     return 0;
   atom->cast_copy_x(host_x,host_type);
 
+  _nall = nall;
+
   int mn;
   nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
                         nspecial, special, success, mn);
diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h
index d03a7521cd..fde1936b25 100644
--- a/lib/gpu/lal_base_three.h
+++ b/lib/gpu/lal_base_three.h
@@ -74,7 +74,7 @@ class BaseThree {
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
-  /** \param nlocal number of particles whose nbors must be stored on device
+  /** \param inum number of particles whose nbors must be stored on device
     * \param max_nbors maximum number of neighbors
     * \param success set to false if insufficient memory
     * \note olist_size=total number of local particles **/
@@ -83,7 +83,7 @@ class BaseThree {
   }
 
   /// Check if there is enough storage for neighbors and realloc if not
-  /** \param nlocal number of particles whose nbors must be stored on device
+  /** \param inum number of particles whose nbors must be stored on device
     * \param host_inum number of particles whose nbors need to copied to host
     * \param max_nbors current maximum number of neighbors
     * \note host_inum is 0 if the host is performing neighboring
@@ -203,7 +203,7 @@ class BaseThree {
   int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
   int _gpu_nbor;
   double _max_bytes, _max_an_bytes;
-  int _max_nbors, _ainum;
+  int _max_nbors, _ainum, _nall;
   double _gpu_overhead, _driver_overhead;
   UCL_D_Vec<int> *_nbor_data;
 
diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp
index d830499257..cb4a3fdbd6 100644
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@@ -225,151 +225,6 @@ double TersoffT::host_memory_usage() const {
 
 #define KTHREADS this->_threads_per_atom
 #define JTHREADS this->_threads_per_atom
-// ---------------------------------------------------------------------------
-// Copy nbor list from host if necessary and then calculate forces, virials,..
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
-                       const int nlist, double **host_x, int *host_type,
-                       int *ilist, int *numj, int **firstneigh,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start,
-                       const double cpu_time, bool &success) {
-  this->acc_timers();
-  if (inum_full==0) {
-    host_start=0;
-    // Make sure textures are correct if realloc by a different hybrid style
-    this->resize_atom(0,nall,success);
-    this->zero_timers();
-    return;
-  }
-
-  int ago=this->hd_balancer.ago_first(f_ago);
-  int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
-  this->ans->inum(inum);
-  #ifdef THREE_CONCURRENT
-  this->ans2->inum(inum);
-  #endif
-  host_start=inum;
-
-  if (ago==0) {
-    this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
-    if (!success)
-      return;
-    this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
-  }
-
-  this->atom->cast_x_data(host_x,host_type);
-  this->hd_balancer.start_timer();
-  this->atom->add_x_data(host_x,host_type);
-
-  // re-allocate zetaij if necessary
-  if (nall*this->_max_nbors > _zetaij.cols()) {
-    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(this->_max_nbors*_nmax);
-  }
-
-  this->_ainum=nlist;
-
-  int _eflag;
-  if (eflag)
-    _eflag=1;
-  else
-    _eflag=0;
-
-  int evatom=0;
-  if (eatom || vatom)
-    evatom=1;
-  #ifdef THREE_CONCURRENT
-  this->ucl_device->sync();
-  #endif
-  loop(eflag,vflag,evatom);
-  this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
-  this->device->add_ans_object(this->ans);
-  #ifdef THREE_CONCURRENT
-  this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
-  this->device->add_ans_object(this->ans2);
-  #endif
-  this->hd_balancer.stop_timer();
-}
-
-// ---------------------------------------------------------------------------
-// Reneighbor on GPU if necessary and then compute forces, virials, energies
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int ** TersoffT::compute(const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag,
-                         int **nspecial, tagint **special, const bool eflag,
-                         const bool vflag, const bool eatom,
-                         const bool vatom, int &host_start,
-                         int **ilist, int **jnum,
-                         const double cpu_time, bool &success) {
-  this->acc_timers();
-
-  if (inum_full==0) {
-    host_start=0;
-    // Make sure textures are correct if realloc by a different hybrid style
-    this->resize_atom(0,nall,success);
-    this->zero_timers();
-    return NULL;
-  }
-
-  this->hd_balancer.balance(cpu_time);
-  int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
-  this->ans->inum(inum);
-  #ifdef THREE_CONCURRENT
-  this->ans2->inum(inum);
-  #endif
-  host_start=inum;
-
-  // Build neighbor list on GPU if necessary
-  if (ago==0) {
-    this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    sublo, subhi, tag, nspecial, special, success);
-    if (!success)
-      return NULL;
-    this->hd_balancer.start_timer();
-  } else {
-    this->atom->cast_x_data(host_x,host_type);
-    this->hd_balancer.start_timer();
-    this->atom->add_x_data(host_x,host_type);
-  }
-  *ilist=this->nbor->host_ilist.begin();
-  *jnum=this->nbor->host_acc.begin();
-
-  // re-allocate zetaij if necessary
-  if (nall*this->_max_nbors > _zetaij.cols()) {
-    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(this->_max_nbors*_nmax);
-  }
-
-  this->_ainum=nall;
-
-  int _eflag;
-  if (eflag)
-    _eflag=1;
-  else
-    _eflag=0;
-
-  int evatom=0;
-  if (eatom || vatom)
-    evatom=1;
-  #ifdef THREE_CONCURRENT
-  this->ucl_device->sync();
-  #endif
-  loop(eflag,vflag,evatom);
-  this->ans->copy_answers(eflag,vflag,eatom,vatom);
-  this->device->add_ans_object(this->ans);
-  #ifdef THREE_CONCURRENT
-  this->ans2->copy_answers(eflag,vflag,eatom,vatom);
-  this->device->add_ans_object(this->ans2);
-  #endif
-  this->hd_balancer.stop_timer();
-
-  return this->nbor->host_jlist.begin()-host_start;
-}
-
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
@@ -400,6 +255,13 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                  &this->dev_short_nbor, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
 
+  // re-allocate zetaij if necessary
+  int nall = this->_nall;
+  if (nall*this->_max_nbors > _zetaij.cols()) {
+    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
+    _zetaij.resize(this->_max_nbors*_nmax);
+  }
+
   nbor_pitch=this->nbor->nbor_pitch();
   GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
                                (BX/(JTHREADS*KTHREADS))));
diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h
index 5e2bedfeef..fd01af031a 100644
--- a/lib/gpu/lal_tersoff.h
+++ b/lib/gpu/lal_tersoff.h
@@ -47,21 +47,6 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
            const double* h, const double* gamma, const double* beta,
            const double* powern, const double* cutsq);
 
-  /// Pair loop with host neighboring
-  void compute(const int f_ago, const int inum_full, const int nall,
-               const int nlist, double **host_x, int *host_type,
-               int *ilist, int *numj, int **firstneigh, const bool eflag,
-               const bool vflag, const bool eatom, const bool vatom,
-               int &host_start, const double cpu_time, bool &success);
-
-  /// Pair loop with device neighboring
-  int ** compute(const int ago, const int inum_full,
-                 const int nall, double **host_x, int *host_type, double *sublo,
-                 double *subhi, tagint *tag, int **nspecial,
-                 tagint **special, const bool eflag, const bool vflag,
-                 const bool eatom, const bool vatom, int &host_start,
-                 int **ilist, int **numj, const double cpu_time, bool &success);
-
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp
index ba1804c37e..02000d77d3 100644
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@@ -225,175 +225,6 @@ double TersoffMT::host_memory_usage() const {
 
 #define KTHREADS this->_threads_per_atom
 #define JTHREADS this->_threads_per_atom
-// ---------------------------------------------------------------------------
-// Copy nbor list from host if necessary and then calculate forces, virials,..
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
-                       const int nlist, double **host_x, int *host_type,
-                       int *ilist, int *numj, int **firstneigh,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start,
-                       const double cpu_time, bool &success) {
-  this->acc_timers();
-  if (inum_full==0) {
-    host_start=0;
-    // Make sure textures are correct if realloc by a different hybrid style
-    this->resize_atom(0,nall,success);
-    this->zero_timers();
-    return;
-  }
-
-  int ago=this->hd_balancer.ago_first(f_ago);
-  int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
-  this->ans->inum(inum);
-  #ifdef THREE_CONCURRENT
-  this->ans2->inum(inum);
-  #endif
-  host_start=inum;
-
-  if (ago==0) {
-    this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
-    if (!success)
-      return;
-    this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
-  }
-
-  this->atom->cast_x_data(host_x,host_type);
-  this->hd_balancer.start_timer();
-  this->atom->add_x_data(host_x,host_type);
-
-  // re-allocate zetaij if necessary
-  if (nall*this->_max_nbors > _zetaij.cols()) {
-    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(this->_max_nbors*_nmax);
-  }
-
-  this->_ainum=nlist;
-
-  int _eflag;
-  if (eflag)
-    _eflag=1;
-  else
-    _eflag=0;
-
-  int ainum=nlist;
-  int nbor_pitch=this->nbor->nbor_pitch();
-  int BX=this->block_pair();
-  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
-                               (BX/(JTHREADS*KTHREADS))));
-
-  this->k_zeta.set_size(GX,BX);
-  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
-                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
-                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
-
-  int evatom=0;
-  if (eatom || vatom)
-    evatom=1;
-  #ifdef THREE_CONCURRENT
-  this->ucl_device->sync();
-  #endif
-  loop(eflag,vflag,evatom);
-  this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
-  this->device->add_ans_object(this->ans);
-  #ifdef THREE_CONCURRENT
-  this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
-  this->device->add_ans_object(this->ans2);
-  #endif
-  this->hd_balancer.stop_timer();
-}
-
-// ---------------------------------------------------------------------------
-// Reneighbor on GPU if necessary and then compute forces, virials, energies
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int ** TersoffMT::compute(const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag,
-                         int **nspecial, tagint **special, const bool eflag,
-                         const bool vflag, const bool eatom,
-                         const bool vatom, int &host_start,
-                         int **ilist, int **jnum,
-                         const double cpu_time, bool &success) {
-  this->acc_timers();
-
-  if (inum_full==0) {
-    host_start=0;
-    // Make sure textures are correct if realloc by a different hybrid style
-    this->resize_atom(0,nall,success);
-    this->zero_timers();
-    return NULL;
-  }
-
-  this->hd_balancer.balance(cpu_time);
-  int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
-  this->ans->inum(inum);
-  #ifdef THREE_CONCURRENT
-  this->ans2->inum(inum);
-  #endif
-  host_start=inum;
-
-  // Build neighbor list on GPU if necessary
-  if (ago==0) {
-    this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    sublo, subhi, tag, nspecial, special, success);
-    if (!success)
-      return NULL;
-    this->hd_balancer.start_timer();
-  } else {
-    this->atom->cast_x_data(host_x,host_type);
-    this->hd_balancer.start_timer();
-    this->atom->add_x_data(host_x,host_type);
-  }
-  *ilist=this->nbor->host_ilist.begin();
-  *jnum=this->nbor->host_acc.begin();
-
-  // re-allocate zetaij if necessary
-  if (nall*this->_max_nbors > _zetaij.cols()) {
-    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(this->_max_nbors*_nmax);
-  }
-
-  this->_ainum=nall;
-
-  int _eflag;
-  if (eflag)
-    _eflag=1;
-  else
-    _eflag=0;
-
-  int ainum=nall;
-  int nbor_pitch=this->nbor->nbor_pitch();
-  int BX=this->block_pair();
-  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
-                               (BX/(JTHREADS*KTHREADS))));
-
-  this->k_zeta.set_size(GX,BX);
-  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
-                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
-                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
-
-  int evatom=0;
-  if (eatom || vatom)
-    evatom=1;
-  #ifdef THREE_CONCURRENT
-  this->ucl_device->sync();
-  #endif
-  loop(eflag,vflag,evatom);
-  this->ans->copy_answers(eflag,vflag,eatom,vatom);
-  this->device->add_ans_object(this->ans);
-  #ifdef THREE_CONCURRENT
-  this->ans2->copy_answers(eflag,vflag,eatom,vatom);
-  this->device->add_ans_object(this->ans2);
-  #endif
-  this->hd_balancer.stop_timer();
-
-  return this->nbor->host_jlist.begin()-host_start;
-}
-
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
@@ -424,6 +255,13 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                  &this->dev_short_nbor, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
 
+  // re-allocate zetaij if necessary
+  int nall = this->_nall;
+  if (nall*this->_max_nbors > _zetaij.cols()) {
+    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
+    _zetaij.resize(this->_max_nbors*_nmax);
+  }
+
   nbor_pitch=this->nbor->nbor_pitch();
   GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
                                (BX/(JTHREADS*KTHREADS))));
diff --git a/lib/gpu/lal_tersoff_mod.h b/lib/gpu/lal_tersoff_mod.h
index 286a23fa6f..ab1560d951 100644
--- a/lib/gpu/lal_tersoff_mod.h
+++ b/lib/gpu/lal_tersoff_mod.h
@@ -47,21 +47,6 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
            const double* h, const double* beta, const double* powern,
            const double* powern_del, const double* ca1, const double* cutsq);
 
-  /// Pair loop with host neighboring
-  void compute(const int f_ago, const int inum_full, const int nall,
-               const int nlist, double **host_x, int *host_type,
-               int *ilist, int *numj, int **firstneigh, const bool eflag,
-               const bool vflag, const bool eatom, const bool vatom,
-               int &host_start, const double cpu_time, bool &success);
-
-  /// Pair loop with device neighboring
-  int ** compute(const int ago, const int inum_full,
-                 const int nall, double **host_x, int *host_type, double *sublo,
-                 double *subhi, tagint *tag, int **nspecial,
-                 tagint **special, const bool eflag, const bool vflag,
-                 const bool eatom, const bool vatom, int &host_start,
-                 int **ilist, int **numj, const double cpu_time, bool &success);
-
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();
diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp
index 6efa8b9487..33edabd799 100644
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@@ -250,175 +250,6 @@ double TersoffZT::host_memory_usage() const {
 
 #define KTHREADS this->_threads_per_atom
 #define JTHREADS this->_threads_per_atom
-// ---------------------------------------------------------------------------
-// Copy nbor list from host if necessary and then calculate forces, virials,..
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
-                       const int nlist, double **host_x, int *host_type,
-                       int *ilist, int *numj, int **firstneigh,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start,
-                       const double cpu_time, bool &success) {
-  this->acc_timers();
-  if (inum_full==0) {
-    host_start=0;
-    // Make sure textures are correct if realloc by a different hybrid style
-    this->resize_atom(0,nall,success);
-    this->zero_timers();
-    return;
-  }
-
-  int ago=this->hd_balancer.ago_first(f_ago);
-  int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
-  this->ans->inum(inum);
-  #ifdef THREE_CONCURRENT
-  this->ans2->inum(inum);
-  #endif
-  host_start=inum;
-
-  if (ago==0) {
-    this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
-    if (!success)
-      return;
-    this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
-  }
-
-  this->atom->cast_x_data(host_x,host_type);
-  this->hd_balancer.start_timer();
-  this->atom->add_x_data(host_x,host_type);
-
-  // re-allocate zetaij if necessary
-  if (nall*this->_max_nbors > _zetaij.cols()) {
-    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(this->_max_nbors*_nmax);
-  }
-
-  this->_ainum=nlist;
-
-  int _eflag;
-  if (eflag)
-    _eflag=1;
-  else
-    _eflag=0;
-
-  int ainum=nlist;
-  int nbor_pitch=this->nbor->nbor_pitch();
-  int BX=this->block_pair();
-  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
-                               (BX/(JTHREADS*KTHREADS))));
-
-  this->k_zeta.set_size(GX,BX);
-  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
-                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
-                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
-
-  int evatom=0;
-  if (eatom || vatom)
-    evatom=1;
-  #ifdef THREE_CONCURRENT
-  this->ucl_device->sync();
-  #endif
-  loop(eflag,vflag,evatom);
-  this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
-  this->device->add_ans_object(this->ans);
-  #ifdef THREE_CONCURRENT
-  this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
-  this->device->add_ans_object(this->ans2);
-  #endif
-  this->hd_balancer.stop_timer();
-}
-
-// ---------------------------------------------------------------------------
-// Reneighbor on GPU if necessary and then compute forces, virials, energies
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int ** TersoffZT::compute(const int ago, const int inum_full,
-                         const int nall, double **host_x, int *host_type,
-                         double *sublo, double *subhi, tagint *tag,
-                         int **nspecial, tagint **special, const bool eflag,
-                         const bool vflag, const bool eatom,
-                         const bool vatom, int &host_start,
-                         int **ilist, int **jnum,
-                         const double cpu_time, bool &success) {
-  this->acc_timers();
-
-  if (inum_full==0) {
-    host_start=0;
-    // Make sure textures are correct if realloc by a different hybrid style
-    this->resize_atom(0,nall,success);
-    this->zero_timers();
-    return NULL;
-  }
-
-  this->hd_balancer.balance(cpu_time);
-  int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
-  this->ans->inum(inum);
-  #ifdef THREE_CONCURRENT
-  this->ans2->inum(inum);
-  #endif
-  host_start=inum;
-
-  // Build neighbor list on GPU if necessary
-  if (ago==0) {
-    this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    sublo, subhi, tag, nspecial, special, success);
-    if (!success)
-      return NULL;
-    this->hd_balancer.start_timer();
-  } else {
-    this->atom->cast_x_data(host_x,host_type);
-    this->hd_balancer.start_timer();
-    this->atom->add_x_data(host_x,host_type);
-  }
-  *ilist=this->nbor->host_ilist.begin();
-  *jnum=this->nbor->host_acc.begin();
-
-  // re-allocate zetaij if necessary
-  if (nall*this->_max_nbors > _zetaij.cols()) {
-    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-    _zetaij.resize(this->_max_nbors*_nmax);
-  }
-
-  this->_ainum=nall;
-
-  int _eflag;
-  if (eflag)
-    _eflag=1;
-  else
-    _eflag=0;
-
-  int ainum=nall;
-  int nbor_pitch=this->nbor->nbor_pitch();
-  int BX=this->block_pair();
-  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
-                               (BX/(JTHREADS*KTHREADS))));
-
-  this->k_zeta.set_size(GX,BX);
-  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
-                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
-                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
-
-  int evatom=0;
-  if (eatom || vatom)
-    evatom=1;
-  #ifdef THREE_CONCURRENT
-  this->ucl_device->sync();
-  #endif
-  loop(eflag,vflag,evatom);
-  this->ans->copy_answers(eflag,vflag,eatom,vatom);
-  this->device->add_ans_object(this->ans);
-  #ifdef THREE_CONCURRENT
-  this->ans2->copy_answers(eflag,vflag,eatom,vatom);
-  this->device->add_ans_object(this->ans2);
-  #endif
-  this->hd_balancer.stop_timer();
-
-  return this->nbor->host_jlist.begin()-host_start;
-}
-
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
@@ -449,6 +280,13 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                  &this->dev_short_nbor, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
 
+  // re-allocate zetaij if necessary
+  int nall = this->_nall;
+  if (nall*this->_max_nbors > _zetaij.cols()) {
+    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
+    _zetaij.resize(this->_max_nbors*_nmax);
+  }
+
   nbor_pitch=this->nbor->nbor_pitch();
   GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
                                (BX/(JTHREADS*KTHREADS))));
diff --git a/lib/gpu/lal_tersoff_zbl.h b/lib/gpu/lal_tersoff_zbl.h
index a5f1ace754..0e6cac9587 100644
--- a/lib/gpu/lal_tersoff_zbl.h
+++ b/lib/gpu/lal_tersoff_zbl.h
@@ -49,21 +49,6 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
            const double* ZBLcut, const double* ZBLexpscale, const double global_e,
            const double global_a_0, const double global_epsilon_0, const double* cutsq);
 
-  /// Pair loop with host neighboring
-  void compute(const int f_ago, const int inum_full, const int nall,
-               const int nlist, double **host_x, int *host_type,
-               int *ilist, int *numj, int **firstneigh, const bool eflag,
-               const bool vflag, const bool eatom, const bool vatom,
-               int &host_start, const double cpu_time, bool &success);
-
-  /// Pair loop with device neighboring
-  int ** compute(const int ago, const int inum_full,
-                 const int nall, double **host_x, int *host_type, double *sublo,
-                 double *subhi, tagint *tag, int **nspecial,
-                 tagint **special, const bool eflag, const bool vflag,
-                 const bool eatom, const bool vatom, int &host_start,
-                 int **ilist, int **numj, const double cpu_time, bool &success);
-
   /// Clear all host and device data
   /** \note This is called at the beginning of the init() routine **/
   void clear();

From 8c9db3ea0010be90236beb490e8deb64c9c2a785 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Mon, 10 Jul 2017 23:50:21 -0500
Subject: [PATCH 06/11] Built 2-body short neighbor list and used for 2-body
 kernels in tersoff gpu styles

---
 lib/gpu/lal_tersoff.cpp     |  4 +-
 lib/gpu/lal_tersoff.cu      | 75 ++++++++++++++++++--------------
 lib/gpu/lal_tersoff_mod.cpp |  4 +-
 lib/gpu/lal_tersoff_mod.cu  | 73 ++++++++++++++++++-------------
 lib/gpu/lal_tersoff_zbl.cpp |  4 +-
 lib/gpu/lal_tersoff_zbl.cu  | 85 +++++++++++++++++++++----------------
 6 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp
index cb4a3fdbd6..a63d286d9c 100644
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@@ -250,7 +250,8 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                                (BX/this->_threads_per_atom)));
 
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
+  this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
+                 &elem2param, &_nelements, &_nparams,
                  &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                  &this->dev_short_nbor, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
@@ -283,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
                    &map, &elem2param, &_nelements, &_nparams,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                   &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv,
                    &eflag, &vflag, &ainum, &nbor_pitch,
                    &this->_threads_per_atom);
diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu
index 9faa59c34d..0026fb97cb 100644
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@@ -165,7 +165,10 @@ texture<int4> ts5_tex;
 #endif
 
 __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
-                                   const numtyp cutshortsq,
+                                   const __global numtyp *restrict cutsq,
+                                   const __global int *restrict map,
+                                   const __global int *restrict elem2param,
+                                   const int nelements, const int nparams,
                                    const __global int * dev_nbor,
                                    const __global int * dev_packed,
                                    __global int * dev_short_nbor,
@@ -182,6 +185,8 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
+    itype=map[itype];
 
     int ncount = 0;
     int m = nbor;
@@ -195,6 +200,9 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+      jtype=map[jtype];
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -202,7 +210,7 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutshortsq) {
+      if (rsq<cutsq[ijparam]) {
         dev_short_nbor[nbor_short] = nj;
         nbor_short += n_stride;
         ncount++;
@@ -298,7 +306,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
       delr1.z = jx.z-ix.z;
       numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       // compute zeta_ij
       z = (acctyp)0;
@@ -391,6 +399,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
                                   const int nelements, const int nparams,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
+                                  const __global int * dev_short_nbor,
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
@@ -426,9 +435,14 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor];
+    nbor += n_stride;
+    nbor_end = nbor+fast_mul(numj,n_stride);
+
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=dev_packed[nbor];
+      int j=dev_short_nbor[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -443,32 +457,31 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutsq[ijparam]) {
-        numtyp feng[2];
-        numtyp ijparam_lam1 = ts1[ijparam].x;
-        numtyp4 ts2_ijparam = ts2[ijparam];
-        numtyp ijparam_biga = ts2_ijparam.x;
-        numtyp ijparam_bigr = ts2_ijparam.z;
-        numtyp ijparam_bigd = ts2_ijparam.w;
+      // rsq<cutsq[ijparam]
+      numtyp feng[2];
+      numtyp ijparam_lam1 = ts1[ijparam].x;
+      numtyp4 ts2_ijparam = ts2[ijparam];
+      numtyp ijparam_biga = ts2_ijparam.x;
+      numtyp ijparam_bigr = ts2_ijparam.z;
+      numtyp ijparam_bigd = ts2_ijparam.w;
 
-        repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
-                  rsq, eflag, feng);
+      repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
+                rsq, eflag, feng);
 
-        numtyp force = feng[0];
-        f.x+=delx*force;
-        f.y+=dely*force;
-        f.z+=delz*force;
+      numtyp force = feng[0];
+      f.x+=delx*force;
+      f.y+=dely*force;
+      f.z+=delz*force;
 
-        if (eflag>0)
-          energy+=feng[1];
-        if (vflag>0) {
-          virial[0] += delx*delx*force;
-          virial[1] += dely*dely*force;
-          virial[2] += delz*delz*force;
-          virial[3] += delx*dely*force;
-          virial[4] += delx*delz*force;
-          virial[5] += dely*delz*force;
-        }
+      if (eflag>0)
+        energy+=feng[1];
+      if (vflag>0) {
+        virial[0] += delx*delx*force;
+        virial[1] += dely*dely*force;
+        virial[2] += delz*delz*force;
+        virial[3] += delx*dely*force;
+        virial[4] += delx*delz*force;
+        virial[5] += dely*delz*force;
       }
     } // for nbor
 
@@ -556,7 +569,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
 
@@ -669,7 +682,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   const __global int * dev_acc,
-                                  const __global int * dev_short_nbor, 
+                                  const __global int * dev_short_nbor,
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
@@ -738,7 +751,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
@@ -978,7 +991,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp
index 02000d77d3..c37c07f1a1 100644
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@@ -250,7 +250,8 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                                (BX/this->_threads_per_atom)));
 
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
+  this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
+                 &elem2param, &_nelements, &_nparams,
                  &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                  &this->dev_short_nbor, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
@@ -283,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
   this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
                    &map, &elem2param, &_nelements, &_nparams,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                   &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv,
                    &eflag, &vflag, &ainum, &nbor_pitch,
                    &this->_threads_per_atom);
diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu
index 75bacc2179..555485a1b2 100644
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@@ -165,7 +165,10 @@ texture<int4> ts5_tex;
 #endif
 
 __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
-                                   const numtyp cutshortsq,
+                                   const __global numtyp *restrict cutsq,
+                                   const __global int *restrict map,
+                                   const __global int *restrict elem2param,
+                                   const int nelements, const int nparams,
                                    const __global int * dev_nbor,
                                    const __global int * dev_packed,
                                    __global int * dev_short_nbor,
@@ -182,6 +185,8 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
+    itype=map[itype];
 
     int ncount = 0;
     int m = nbor;
@@ -195,6 +200,9 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+      jtype=map[jtype];
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -202,7 +210,7 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutshortsq) {
+      if (rsq<cutsq[ijparam]) {
         dev_short_nbor[nbor_short] = nj;
         nbor_short += n_stride;
         ncount++;
@@ -298,7 +306,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
       delr1.z = jx.z-ix.z;
       numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       // compute zeta_ij
       z = (acctyp)0;
@@ -392,6 +400,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
                                   const int nelements, const int nparams,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
+                                  const __global int * dev_short_nbor,
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
@@ -427,9 +436,14 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor];
+    nbor += n_stride;
+    nbor_end = nbor+fast_mul(numj,n_stride);
+
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=dev_packed[nbor];
+      int j=dev_short_nbor[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -444,32 +458,31 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutsq[ijparam]) {
-        numtyp feng[2];
-        numtyp ijparam_lam1 = ts1[ijparam].x;
-        numtyp4 ts2_ijparam = ts2[ijparam];
-        numtyp ijparam_biga = ts2_ijparam.x;
-        numtyp ijparam_bigr = ts2_ijparam.z;
-        numtyp ijparam_bigd = ts2_ijparam.w;
+      // rsq<cutsq[ijparam]
+      numtyp feng[2];
+      numtyp ijparam_lam1 = ts1[ijparam].x;
+      numtyp4 ts2_ijparam = ts2[ijparam];
+      numtyp ijparam_biga = ts2_ijparam.x;
+      numtyp ijparam_bigr = ts2_ijparam.z;
+      numtyp ijparam_bigd = ts2_ijparam.w;
 
-        repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
-                  rsq, eflag, feng);
+      repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
+                rsq, eflag, feng);
 
-        numtyp force = feng[0];
-        f.x+=delx*force;
-        f.y+=dely*force;
-        f.z+=delz*force;
+      numtyp force = feng[0];
+      f.x+=delx*force;
+      f.y+=dely*force;
+      f.z+=delz*force;
 
-        if (eflag>0)
-          energy+=feng[1];
-        if (vflag>0) {
-          virial[0] += delx*delx*force;
-          virial[1] += dely*dely*force;
-          virial[2] += delz*delz*force;
-          virial[3] += delx*dely*force;
-          virial[4] += delx*delz*force;
-          virial[5] += dely*delz*force;
-        }
+      if (eflag>0)
+        energy+=feng[1];
+      if (vflag>0) {
+        virial[0] += delx*delx*force;
+        virial[1] += dely*dely*force;
+        virial[2] += delz*delz*force;
+        virial[3] += delx*dely*force;
+        virial[4] += delx*delz*force;
+        virial[5] += dely*delz*force;
       }
     } // for nbor
 
@@ -560,7 +573,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
 
@@ -748,7 +761,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
@@ -997,7 +1010,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp
index 33edabd799..827613067c 100644
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@@ -275,7 +275,8 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                                (BX/this->_threads_per_atom)));
 
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
+  this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
+                 &elem2param, &_nelements, &_nparams,
                  &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                  &this->dev_short_nbor, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
@@ -309,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                    &_global_e, &_global_a_0, &_global_epsilon_0, &cutsq,
                    &map, &elem2param, &_nelements, &_nparams,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                   &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv,
                    &eflag, &vflag, &ainum, &nbor_pitch,
                    &this->_threads_per_atom);
diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu
index 439d4028df..89ae72df8a 100644
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@@ -168,7 +168,10 @@ texture<int4> ts6_tex;
 #endif
 
 __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
-                                   const numtyp cutshortsq,
+                                   const __global numtyp *restrict cutsq,
+                                   const __global int *restrict map,
+                                   const __global int *restrict elem2param,
+                                   const int nelements, const int nparams,
                                    const __global int * dev_nbor,
                                    const __global int * dev_packed,
                                    __global int * dev_short_nbor,
@@ -185,6 +188,8 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
+    itype=map[itype];
 
     int ncount = 0;
     int m = nbor;
@@ -198,6 +203,9 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+      jtype=map[jtype];
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -205,7 +213,7 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutshortsq) {
+      if (rsq<cutsq[ijparam]) {
         dev_short_nbor[nbor_short] = nj;
         nbor_short += n_stride;
         ncount++;
@@ -304,7 +312,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
       delr1.z = jx.z-ix.z;
       numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       // compute zeta_ij
       z = (acctyp)0;
@@ -403,6 +411,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
                                   const int nelements, const int nparams,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
+                                  const __global int * dev_short_nbor,
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
@@ -440,9 +449,14 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
     int itype=ix.w;
     itype=map[itype];
 
+    // recalculate numj and nbor_end for use of the short nbor list
+    numj = dev_short_nbor[nbor];
+    nbor += n_stride;
+    nbor_end = nbor+fast_mul(numj,n_stride);
+
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=dev_packed[nbor];
+      int j=dev_short_nbor[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -457,38 +471,37 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutsq[ijparam]) {
-        numtyp feng[2];
-        numtyp ijparam_lam1 = ts1[ijparam].x;
-        numtyp4 ts2_ijparam = ts2[ijparam];
-        numtyp ijparam_biga = ts2_ijparam.x;
-        numtyp ijparam_bigr = ts2_ijparam.z;
-        numtyp ijparam_bigd = ts2_ijparam.w;
-        numtyp4 ts6_ijparam = ts6[ijparam];
-        numtyp ijparam_Z_i = ts6_ijparam.x;
-        numtyp ijparam_Z_j = ts6_ijparam.y;
-        numtyp ijparam_ZBLcut = ts6_ijparam.z;
-        numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
+      // rsq<cutsq[ijparam]
+      numtyp feng[2];
+      numtyp ijparam_lam1 = ts1[ijparam].x;
+      numtyp4 ts2_ijparam = ts2[ijparam];
+      numtyp ijparam_biga = ts2_ijparam.x;
+      numtyp ijparam_bigr = ts2_ijparam.z;
+      numtyp ijparam_bigd = ts2_ijparam.w;
+      numtyp4 ts6_ijparam = ts6[ijparam];
+      numtyp ijparam_Z_i = ts6_ijparam.x;
+      numtyp ijparam_Z_j = ts6_ijparam.y;
+      numtyp ijparam_ZBLcut = ts6_ijparam.z;
+      numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
 
-        repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
-                  ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
-                  global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
+      repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
+                ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
+                global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
 
-        numtyp force = feng[0];
-        f.x+=delx*force;
-        f.y+=dely*force;
-        f.z+=delz*force;
+      numtyp force = feng[0];
+      f.x+=delx*force;
+      f.y+=dely*force;
+      f.z+=delz*force;
 
-        if (eflag>0)
-          energy+=feng[1];
-        if (vflag>0) {
-          virial[0] += delx*delx*force;
-          virial[1] += dely*dely*force;
-          virial[2] += delz*delz*force;
-          virial[3] += delx*dely*force;
-          virial[4] += delx*delz*force;
-          virial[5] += dely*delz*force;
-        }
+      if (eflag>0)
+        energy+=feng[1];
+      if (vflag>0) {
+        virial[0] += delx*delx*force;
+        virial[1] += dely*dely*force;
+        virial[2] += delz*delz*force;
+        virial[3] += delx*dely*force;
+        virial[4] += delx*delz*force;
+        virial[5] += dely*delz*force;
       }
     } // for nbor
 
@@ -576,7 +589,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
 
@@ -758,7 +771,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
@@ -998,7 +1011,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-      if (rsq1 > cutsq[ijparam]) continue;
+//      if (rsq1 > cutsq[ijparam]) continue;
 
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];

From cdac5f496c531a851b1bdc5b6177f590150c9fff Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Tue, 11 Jul 2017 00:13:56 -0500
Subject: [PATCH 07/11] Built 3-body short neighbor list for the 3-body kernels
 using per-pair cutoffs for vashishta gpu style

---
 lib/gpu/lal_vashishta.cpp |  3 ++-
 lib/gpu/lal_vashishta.cu  | 22 +++++++++++++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp
index 55318bb3b0..d03ac992bd 100644
--- a/lib/gpu/lal_vashishta.cpp
+++ b/lib/gpu/lal_vashishta.cpp
@@ -233,7 +233,8 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                                (BX/this->_threads_per_atom)));
 
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
+  this->k_short_nbor.run(&this->atom->x, &param4, &map,
+                 &elem2param, &_nelements, &_nparams,
                  &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                  &this->dev_short_nbor, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu
index 7449b18f6b..d226aba6dd 100644
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@@ -137,7 +137,10 @@ texture<int4> param5_tex;
 #endif
 
 __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
-                                     const numtyp cutshortsq,
+                                     const __global numtyp4 *restrict param4,
+                                     const __global int *restrict map,
+                                     const __global int *restrict elem2param,
+                                     const int nelements, const int nparams,
                                      const __global int * dev_nbor,
                                      const __global int * dev_packed,
                                      __global int * dev_short_nbor,
@@ -154,6 +157,8 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
+    itype=map[itype];
 
     int ncount = 0;
     int m = nbor;
@@ -167,6 +172,9 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+      jtype=map[jtype];
+      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -174,7 +182,7 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutshortsq) {
+      if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq
         dev_short_nbor[nbor_short] = nj;
         nbor_short += n_stride;
         ncount++;
@@ -461,7 +469,9 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
       
       numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
       param_r0sq_ij=param4_ijparam.x;
-      if (rsq1 > param_r0sq_ij) continue;
+
+//      if (rsq1 > param_r0sq_ij) continue;
+
       param_gamma_ij=param4_ijparam.y;
       param_r0_ij=param4_ijparam.w;
       
@@ -592,7 +602,8 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
       int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
       numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
       param_r0sq_ij = param4_ijparam.x;
-      if (rsq1 > param_r0sq_ij) continue;
+
+//      if (rsq1 > param_r0sq_ij) continue;
 
       param_gamma_ij=param4_ijparam.y;
       param_r0_ij = param4_ijparam.w;
@@ -742,7 +753,8 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
       int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
       numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
       param_r0sq_ij=param4_ijparam.x;
-      if (rsq1 > param_r0sq_ij) continue;
+
+//      if (rsq1 > param_r0sq_ij) continue;
 
       param_gamma_ij=param4_ijparam.y;
       param_r0_ij=param4_ijparam.w;

From 3d1d0c58c7322aca3ab8019dfc176875868a9743 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Fri, 21 Jul 2017 12:08:04 -0500
Subject: [PATCH 08/11] Cleaned up 3-body gpu styles, and fixed a bug for
 tersoff/zbl/gpu. There is a unresolved bug for neigh no with tpa > 1 with
 BaseThree, enforce tpa = 1 for neigh no in BaseThree for now.

---
 lib/gpu/lal_base_three.cpp  | 41 ++++++++++++++-----------------------
 lib/gpu/lal_base_three.h    |  2 +-
 lib/gpu/lal_tersoff_mod.cu  |  2 +-
 lib/gpu/lal_tersoff_zbl.cpp |  2 +-
 lib/gpu/lal_tersoff_zbl.cu  |  2 +-
 5 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp
index 4e8a95937c..5f3c57337e 100644
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
 extern Device<PRECISION,ACC_PRECISION> global_device;
 
 template <class numtyp, class acctyp>
-BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _short_nbor(false) {
+BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
   device=&global_device;
   ans=new Answer<numtyp,acctyp>();
   nbor=new Neighbor();
@@ -73,6 +73,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
   if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
     nbor->packing(true);
     _nbor_data=&(nbor->dev_packed);
+    _threads_per_atom = 1; // enforce tpa = 1 for now
   } else  // neigh yes or tpa == 1
     _nbor_data=&(nbor->dev_nbor);
   if (_threads_per_atom*_threads_per_atom>device->warp_size())
@@ -113,14 +114,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
   _max_an_bytes+=ans2->gpu_bytes();
   #endif
 
-  // if short neighbor list is supported
-  if (short_nbor) {
-    _short_nbor = true;
-    int ef_nall=nall;
-    if (ef_nall==0)
-      ef_nall=2000;
-    dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
-  }
+  int ef_nall=nall;
+  if (ef_nall==0)
+    ef_nall=2000;
+  dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
 
   return 0;
 }
@@ -269,14 +266,10 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
 
-  // if short neighbor list is supported
-  if (_short_nbor) {
-
-    // re-allocate dev_short_nbor if necessary
-    if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
-      int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-      dev_short_nbor.resize((2+_max_nbors)*_nmax);
-    }
+  // re-allocate dev_short_nbor if necessary
+  if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
+    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
+    dev_short_nbor.resize((2+_max_nbors)*_nmax);
   }
 
   // _ainum to be used in loop() for short neighbor list build
@@ -342,14 +335,10 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
   *ilist=nbor->host_ilist.begin();
   *jnum=nbor->host_acc.begin();
 
-  // if short neighbor list is supported
-  if (_short_nbor) {
-
-    // re-allocate dev_short_nbor if necessary
-    if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
-      int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
-      dev_short_nbor.resize((2+_max_nbors)*_nmax);
-    }
+  // re-allocate dev_short_nbor if necessary
+  if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
+    int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
+    dev_short_nbor.resize((2+_max_nbors)*_nmax);
   }
 
   // _ainum to be used in loop() for short neighbor list build
@@ -394,7 +383,7 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
   k_three_end.set_function(*pair_program,three_end);
   k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
   k_pair.set_function(*pair_program,two);
-  if (short_nbor) k_short_nbor.set_function(*pair_program,short_nbor);
+  k_short_nbor.set_function(*pair_program,short_nbor);
   pos_tex.get_texture(*pair_program,"pos_tex");
 
   #ifdef THREE_CONCURRENT
diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h
index fde1936b25..f5f36863c4 100644
--- a/lib/gpu/lal_base_three.h
+++ b/lib/gpu/lal_base_three.h
@@ -199,7 +199,7 @@ class BaseThree {
   UCL_Texture pos_tex;
 
  protected:
-  bool _compiled,_short_nbor;
+  bool _compiled;
   int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
   int _gpu_nbor;
   double _max_bytes, _max_an_bytes;
diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu
index 555485a1b2..257d3972ed 100644
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@@ -690,7 +690,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   const __global int * dev_acc,
-                                  const __global int * dev_short_nbor, 
+                                  const __global int * dev_short_nbor,
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp
index 827613067c..341f663030 100644
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@@ -293,7 +293,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                                (BX/(JTHREADS*KTHREADS))));
 
   this->k_zeta.set_size(GX,BX);
-  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
+  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
                    &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                    &this->dev_short_nbor,
diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu
index 89ae72df8a..89317b990c 100644
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@@ -702,7 +702,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
                                   const __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   const __global int * dev_acc,
-                                  const __global int * dev_short_nbor, 
+                                  const __global int * dev_short_nbor,
                                   __global acctyp4 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,

From a71f5a0c20f7e23c3d5cfe2e73eff38aa4dcb5c3 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Sat, 22 Jul 2017 22:57:37 -0500
Subject: [PATCH 09/11] Enabled again neigh no with tpa > 1 for 3-body gpu
 styles for backward compatibility, could be slower than neigh no tpa 1 in
 many cases

---
 lib/gpu/lal_base_three.cpp |   1 -
 lib/gpu/lal_sw.cu          |  92 ++++++++++++++++----------
 lib/gpu/lal_tersoff.cu     | 131 ++++++++++++++++++++++---------------
 lib/gpu/lal_tersoff_mod.cu | 130 ++++++++++++++++++++++--------------
 lib/gpu/lal_tersoff_zbl.cu | 130 ++++++++++++++++++++++--------------
 lib/gpu/lal_vashishta.cu   |  82 ++++++++++++++---------
 6 files changed, 349 insertions(+), 217 deletions(-)

diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp
index 5f3c57337e..aa77a48c66 100644
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@@ -73,7 +73,6 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
   if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
     nbor->packing(true);
     _nbor_data=&(nbor->dev_packed);
-    _threads_per_atom = 1; // enforce tpa = 1 for now
   } else  // neigh yes or tpa == 1
     _nbor_data=&(nbor->dev_nbor);
   if (_threads_per_atom*_threads_per_atom>device->warp_size())
diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu
index 7dea52898e..a5c9f49d08 100644
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@@ -167,7 +167,6 @@ __kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
       jtype=map[jtype];
-
       int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
@@ -217,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    int nbor, nbor_end;
-    int i, numj;
+    int nbor, nbor_end, i, numj;
+    const int* nbor_mem = dev_packed;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -227,13 +226,16 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor];
-    nbor += n_stride;
-    nbor_end = nbor+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=dev_short_nbor[nbor];
+      int j=nbor_mem[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -426,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -437,14 +439,17 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
     int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -466,15 +471,22 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
       sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
       sw_cut_ij=sw3_ijparam.x;
 
-      int nbor_k=nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int nbor_k,k_end;
+      if (dev_packed==dev_nbor) {
+        nbor_k=nborj_start-offset_j+offset_k;
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      } else {
+        nbor_k = nbor_j-offset_j+offset_k;
+        if (nbor_k<=nbor_j) nbor_k += n_stride;
+        k_end = nbor_end;
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
-        if (k <= j) continue;
+        if (dev_packed==dev_nbor && k <= j) continue;
 
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
@@ -558,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -569,12 +581,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -614,12 +629,14 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -707,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -718,12 +735,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -763,12 +783,14 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu
index 0026fb97cb..65c0fa49fc 100644
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@@ -271,9 +271,8 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    int nbor_j, nbor_end;
-    int i, numj;
-
+    int nbor_j, nbor_end, i, numj;
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -284,14 +283,17 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
     int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -312,11 +314,14 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
       z = (acctyp)0;
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int k_end = nbor_end;
+      if (dev_packed==dev_nbor) {
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k < k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == j) continue;
@@ -353,7 +358,8 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
       //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
       //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
-      int idx = nbor_j - n_stride;
+      int idx = nbor_j;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
       acc_zeta(z, tid, t_per_atom, offset_k);
@@ -426,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    int nbor, nbor_end;
-    int i, numj;
+    int nbor, nbor_end, i, numj;
+    const int* nbor_mem=dev_packed;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -436,13 +442,16 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor];
-    nbor += n_stride;
-    nbor_end = nbor+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=dev_short_nbor[nbor];
+      int j=nbor_mem[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -536,7 +545,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end;
-
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -547,14 +556,17 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
     int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -578,7 +590,8 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
       //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
       //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
-      int idx = nbor_j - n_stride;
+      int idx = nbor_j;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
       acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
@@ -602,11 +615,14 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
       }
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int k_end = nbor_end;
+      if (dev_packed==dev_nbor) {
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (j == k) continue;
@@ -717,7 +733,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -730,13 +746,16 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -776,16 +795,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
       int nbork_start = nbor_k;
 
       // look up for zeta_ji: find i in the j's neighbor list
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -808,7 +829,8 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
       //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
-      int idx = ijnum - n_stride;
+      int idx = ijnum;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
@@ -833,7 +855,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -875,7 +897,8 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
         //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
-        int idx = nbor_k - n_stride;
+        int idx = nbor_k;
+        if (dev_packed==dev_nbor) idx -= n_stride;
 //        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
@@ -957,7 +980,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -970,13 +993,16 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -1016,16 +1042,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
       int nbork_start = nbor_k;
 
       // look up for zeta_ji
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -1048,7 +1076,8 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
       //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
-      int idx = ijnum - n_stride;
+      int idx = ijnum;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; //  fetch(zeta_ji,idx,zeta_tex);
@@ -1073,7 +1102,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -1122,7 +1151,8 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
         //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
-        int idx = nbor_k - n_stride;
+        int idx = nbor_k;
+        if (dev_packed==dev_nbor) idx -= n_stride;
 //        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
@@ -1152,7 +1182,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
         virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
         virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
         virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
-
       }
     } // for nbor
 
diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu
index 257d3972ed..3297bc74fb 100644
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@@ -271,9 +271,8 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    int nbor_j, nbor_end;
-    int i, numj;
-
+    int nbor_j, nbor_end, i, numj;
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -284,14 +283,17 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
     int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -312,11 +314,14 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
       z = (acctyp)0;
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int k_end = nbor_end;
+      if (dev_packed==dev_nbor) {
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k < k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == j) continue;
@@ -356,7 +361,8 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
       //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
       //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
-      int idx = nbor_j - n_stride;
+      int idx = nbor_j;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
       acc_zeta(z, tid, t_per_atom, offset_k);
@@ -427,8 +433,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    int nbor, nbor_end;
-    int i, numj;
+    int nbor, nbor_end, i, numj;
+    const int* nbor_mem=dev_packed;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -437,13 +443,16 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor];
-    nbor += n_stride;
-    nbor_end = nbor+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=dev_short_nbor[nbor];
+      int j=nbor_mem[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -540,7 +549,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end;
-
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -551,14 +560,17 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
     int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -582,7 +594,8 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
       //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
       //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
-      int idx = nbor_j - n_stride;
+      int idx = nbor_j;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
       acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
@@ -606,11 +619,14 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
       }
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int k_end = nbor_end;
+      if (dev_packed==dev_nbor) {
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (j == k) continue;
@@ -727,7 +743,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -740,13 +756,16 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -786,16 +805,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
       int nbork_start = nbor_k;
 
       // look up for zeta_ji: find i in the j's neighbor list
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -818,7 +839,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
       //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
-      int idx = ijnum - n_stride;
+      int idx = ijnum;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
@@ -843,7 +865,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -888,7 +910,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
         //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
-        int idx = nbor_k - n_stride;
+        int idx = nbor_k;
+        if (dev_packed==dev_nbor) idx -= n_stride;
 //        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
@@ -976,7 +999,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -989,13 +1012,16 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -1035,16 +1061,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
       int nbork_start = nbor_k;
 
       // look up for zeta_ji
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -1067,7 +1095,8 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
       //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
-      int idx = ijnum - n_stride;
+      int idx = ijnum;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; //  fetch(zeta_ji,idx,zeta_tex);
@@ -1092,7 +1121,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -1144,7 +1173,8 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
         //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
-        int idx = nbor_k - n_stride;
+        int idx = nbor_k;
+        if (dev_packed==dev_nbor) idx -= n_stride;
 //        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu
index 89317b990c..8fd5489543 100644
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@@ -277,9 +277,8 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    int nbor_j, nbor_end;
-    int i, numj;
-
+    int nbor_j, nbor_end, i, numj;
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -290,14 +289,17 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
     int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -318,11 +320,14 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
       z = (acctyp)0;
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int k_end = nbor_end;
+      if (dev_packed==dev_nbor) {
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k < k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == j) continue;
@@ -359,7 +364,8 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
       //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
       //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
-      int idx = nbor_j - n_stride;
+      int idx = nbor_j;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
       acc_zeta(z, tid, t_per_atom, offset_k);
@@ -440,8 +446,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    int nbor, nbor_end;
-    int i, numj;
+    int nbor, nbor_end, i, numj;
+    const int* nbor_mem=dev_packed;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -450,13 +456,16 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor];
-    nbor += n_stride;
-    nbor_end = nbor+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
-      int j=dev_short_nbor[nbor];
+      int j=nbor_mem[nbor];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -556,7 +565,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end;
-
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -567,14 +576,17 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
     int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -598,7 +610,8 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
       //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
       //int idx = jj*n_stride + i*t_per_atom + offset_j;
       //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
-      int idx = nbor_j - n_stride;
+      int idx = nbor_j;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               i, nbor_j, offset_j, idx);
       acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
@@ -622,11 +635,14 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
       }
 
       int nbor_k = nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int k_end = nbor_end;
+      if (dev_packed==dev_nbor) {
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (j == k) continue;
@@ -737,7 +753,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem=dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -750,13 +766,16 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -796,16 +815,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
       int nbork_start = nbor_k;
 
       // look up for zeta_ji: find i in the j's neighbor list
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -828,7 +849,8 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
       //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
-      int idx = ijnum - n_stride;
+      int idx = ijnum;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
@@ -853,7 +875,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -895,7 +917,8 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
         //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
-        int idx = nbor_k - n_stride;
+        int idx = nbor_k;
+        if (dev_packed==dev_nbor) idx -= n_stride;
 //        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
@@ -977,7 +1000,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -990,13 +1013,16 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
     numtyp tpainv = ucl_recip((numtyp)t_per_atom);
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
 
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -1036,16 +1062,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
       int nbork_start = nbor_k;
 
       // look up for zeta_ji
       int m = tid / t_per_atom;
       int ijnum = -1;
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
@@ -1068,7 +1096,8 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
       //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
       //int idx = iix*n_stride + j*t_per_atom + offset_kf;
       //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
-      int idx = ijnum - n_stride;
+      int idx = ijnum;
+      if (dev_packed==dev_nbor) idx -= n_stride;
 //      zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //               j, ijnum, offset_kf, idx);
       acctyp4 zeta_ji = zetaij[idx]; //  fetch(zeta_ji,idx,zeta_tex);
@@ -1093,7 +1122,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
 
       // attractive forces
       for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -1142,7 +1171,8 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
         //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
         //int idx = kk*n_stride + j*t_per_atom + offset_k;
         //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
-        int idx = nbor_k - n_stride;
+        int idx = nbor_k;
+        if (dev_packed==dev_nbor) idx -= n_stride;
 //        zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
 //                 j, nbor_k, offset_k, idx);
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu
index d226aba6dd..26805588eb 100644
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@@ -224,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
   __syncthreads();
 
   if (ii<inum) {
-    int nbor, nbor_end;
-    int i, numj;
+    int nbor, nbor_end, i, numj;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -234,6 +233,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
@@ -287,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
         f.x+=delx*force;
         f.y+=dely*force;
         f.z+=delz*force;
+
         if (eflag>0)
           energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
           
@@ -435,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -446,13 +447,17 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
     int nborj_start = nbor_j;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_short_nbor[nbor_j];
+
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -475,15 +480,22 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
       param_gamma_ij=param4_ijparam.y;
       param_r0_ij=param4_ijparam.w;
       
-      int nbor_k=nborj_start-offset_j+offset_k;
-      int numk = dev_short_nbor[nbor_k-n_stride];
-      int k_end = nbor_k+fast_mul(numk,n_stride);
+      int nbor_k,k_end;
+      if (dev_packed==dev_nbor) {
+        nbor_k=nborj_start-offset_j+offset_k;
+        int numk = dev_short_nbor[nbor_k-n_stride];
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      } else {
+        nbor_k = nbor_j-offset_j+offset_k;
+        if (nbor_k<=nbor_j) nbor_k += n_stride;
+        k_end = nbor_end;
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
-        if (k <= j) continue;
+        if (dev_packed==dev_nbor && k <= j) continue;
 
         numtyp4 kx; fetch4(kx,k,pos_tex);
         int ktype=kx.w;
@@ -570,7 +582,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -581,12 +593,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -626,12 +641,14 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;
@@ -721,7 +738,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
 
   if (ii<inum) {
     int i, numj, nbor_j, nbor_end, k_end;
-
+    const int* nbor_mem = dev_packed;
     int offset_j=offset/t_per_atom;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
               n_stride,nbor_end,nbor_j);
@@ -732,12 +749,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     // recalculate numj and nbor_end for use of the short nbor list
-    numj = dev_short_nbor[nbor_j];
-    nbor_j += n_stride;
-    nbor_end = nbor_j+fast_mul(numj,n_stride);
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor_j];
+      nbor_j += n_stride;
+      nbor_end = nbor_j+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
 
     for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
-      int j=dev_short_nbor[nbor_j];
+      int j=nbor_mem[nbor_j];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -777,12 +797,14 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
       }
 
       // recalculate numk and k_end for the use of short neighbor list
-      numk = dev_short_nbor[nbor_k];
-      nbor_k += n_stride;
-      k_end = nbor_k+fast_mul(numk,n_stride);
+      if (dev_packed==dev_nbor) {
+        numk = dev_short_nbor[nbor_k];
+        nbor_k += n_stride;
+        k_end = nbor_k+fast_mul(numk,n_stride);
+      }
 
       for ( ; nbor_k<k_end; nbor_k+=n_stride) {
-        int k=dev_short_nbor[nbor_k];
+        int k=nbor_mem[nbor_k];
         k &= NEIGHMASK;
 
         if (k == i) continue;

From aa60ef6ed8cf577c98690a9771ecb7894f5062ab Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Sun, 23 Jul 2017 00:08:55 -0500
Subject: [PATCH 10/11] Cleaned up 3-body kernels, reverted some mistaken
 changes to vashishta

---
 lib/gpu/lal_tersoff.cu     |  5 -----
 lib/gpu/lal_tersoff_mod.cu |  7 -------
 lib/gpu/lal_tersoff_zbl.cu |  7 -------
 lib/gpu/lal_vashishta.cu   | 10 +++-------
 4 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu
index 65c0fa49fc..cdeb5679d8 100644
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@@ -581,7 +581,6 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
 
@@ -770,8 +769,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
-
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
       mdelr1[1] = -delr1[1];
@@ -1017,8 +1014,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
-
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
       mdelr1[1] = -delr1[1];
diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu
index 3297bc74fb..576359b514 100644
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@@ -308,8 +308,6 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
       delr1.z = jx.z-ix.z;
       numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
 
-//      if (rsq1 > cutsq[ijparam]) continue;
-
       // compute zeta_ij
       z = (acctyp)0;
 
@@ -585,7 +583,6 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
 
@@ -780,8 +777,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
-
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
       mdelr1[1] = -delr1[1];
@@ -1036,8 +1031,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
-
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
       mdelr1[1] = -delr1[1];
diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu
index 8fd5489543..e8bb017f59 100644
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@@ -314,8 +314,6 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
       delr1.z = jx.z-ix.z;
       numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
 
-//      if (rsq1 > cutsq[ijparam]) continue;
-
       // compute zeta_ij
       z = (acctyp)0;
 
@@ -601,7 +599,6 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
 
@@ -790,8 +787,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
-
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
       mdelr1[1] = -delr1[1];
@@ -1037,8 +1032,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
 
-//      if (rsq1 > cutsq[ijparam]) continue;
-
       numtyp mdelr1[3];
       mdelr1[0] = -delr1[0];
       mdelr1[1] = -delr1[1];
diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu
index 26805588eb..fa7f413aa5 100644
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@@ -474,9 +474,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
       
       numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
       param_r0sq_ij=param4_ijparam.x;
-
-//      if (rsq1 > param_r0sq_ij) continue;
-
+      if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
       param_gamma_ij=param4_ijparam.y;
       param_r0_ij=param4_ijparam.w;
       
@@ -617,8 +615,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
       int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
       numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
       param_r0sq_ij = param4_ijparam.x;
-
-//      if (rsq1 > param_r0sq_ij) continue;
+      if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
 
       param_gamma_ij=param4_ijparam.y;
       param_r0_ij = param4_ijparam.w;
@@ -773,8 +770,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
       int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
       numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
       param_r0sq_ij=param4_ijparam.x;
-
-//      if (rsq1 > param_r0sq_ij) continue;
+      if (rsq1 > param_r0sq_ij) continue;  // still keep this for neigh no and tpa > 1
 
       param_gamma_ij=param4_ijparam.y;
       param_r0_ij=param4_ijparam.w;

From 3e9b41c6b74d7f72bb8326e5933420ec9d6dd736 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Wed, 9 Aug 2017 10:05:14 -0500
Subject: [PATCH 11/11] Added references to GPU package citations

---
 src/GPU/fix_gpu.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index 22ec8dde3b..87db73bd12 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -71,6 +71,22 @@ static const char cite_gpu_package[] =
   " year =    2013,\n"
   " volume =  184,\n"
   " pages =   {2785--2793}\n"
+  "}\n\n"
+  "@Article{Trung15,\n"
+  " author = {T. D. Nguyen, S. J. Plimpton},\n"
+  " title = {Accelerating dissipative particle dynamics simulations for soft matter systems},\n"
+  " journal = {Comput.~Mater.~Sci.},\n"
+  " year =    2015,\n"
+  " volume =  100,\n"
+  " pages =   {173--180}\n"
+  "}\n\n"
+  "@Article{Trung17,\n"
+  " author = {T. D. Nguyen},\n"
+  " title = {GPU-accelerated Tersoff potentials for massively parallel Molecular Dynamics simulations},\n"
+  " journal = {Comp.~Phys.~Comm.},\n"
+  " year =    2017,\n"
+  " volume =  212,\n"
+  " pages =   {113--122}\n"
   "}\n\n";
 
 /* ---------------------------------------------------------------------- */