git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2016-07-01 23:27:26 +00:00
parent 8366b35459
commit 9656958169
245 changed files with 4890 additions and 4832 deletions
--- a/lib/gpu/Makefile.lammps.mingw-cross
+++ b/lib/gpu/Makefile.lammps.mingw-cross
@ -1,6 +1,6 @@
 # Settings that the LAMMPS build will import when this package library is used
 # settings for OpenCL builds
 gpu_SYSINC =
-gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
+gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
 gpu_SYSPATH = 

--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@ -7,7 +7,7 @@

 EXTRAMAKE = Makefile.lammps.standard

-ifeq($(CUDA_HOME),)
+ifeq ($(CUDA_HOME),)
 CUDA_HOME = /usr/local/cuda
 endif

--- a/lib/gpu/Makefile.mingw32-cross
+++ b/lib/gpu/Makefile.mingw32-cross
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
 OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
        -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
        -I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
 EXTRAMAKE = Makefile.lammps.mingw-cross
--- a/lib/gpu/Makefile.mingw32-cross-mpi
+++ b/lib/gpu/Makefile.mingw32-cross-mpi
@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
        -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
 	-I../../tools/mingw-cross/mpich2-win32/include/ \
        -DMPICH_IGNORE_CXX_SEEK
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
 	-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
--- a/lib/gpu/Makefile.mingw64-cross
+++ b/lib/gpu/Makefile.mingw64-cross
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
 OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
 	-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
        -I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
 	-L../../src/STUBS -lmpi_mingw64
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
--- a/lib/gpu/Makefile.mingw64-cross-mpi
+++ b/lib/gpu/Makefile.mingw64-cross-mpi
@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
 	-I../../tools/mingw-cross/mpich2-win64/include/ \
        -DMPICH_IGNORE_CXX_SEEK
 
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
 	-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
    gpu_nbor=1;
  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
    gpu_nbor=2;
+  _gpu_nbor=gpu_nbor;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
  if (!success)
    return NULL;

-  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
+  // originally the requirement that nall == nlist was enforced
+  // to allow direct indexing of neighbors of neighbors after re-arrangement
+//  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
+
+  // now the requirement is removed, allowing this to work within pair hybrid
+  nbor->get_host(nlist,ilist,numj,firstneigh,block_size());

  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
  #ifdef THREE_CONCURRENT
@ -214,7 +220,7 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall, 
+void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
                         const int nlist, double **host_x, int *host_type,
                         int *ilist, int *numj, int **firstneigh,
                         const bool eflag, const bool vflag, const bool eatom,
@ -230,7 +236,7 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
  }

  int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(ago,nlocal,cpu_time);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
  ans->inum(inum);
  #ifdef THREE_CONCURRENT
  ans2->inum(inum);
--- a/lib/gpu/lal_base_three.h
+++ b/lib/gpu/lal_base_three.h
@ -205,6 +205,7 @@ class BaseThree {
 protected:
  bool _compiled;
  int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
+  int _gpu_nbor;
  double _max_bytes, _max_an_bytes;
  double _gpu_overhead, _driver_overhead;
  UCL_D_Vec<int> *_nbor_data;
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@ -127,7 +127,10 @@ void Neighbor::alloc(bool &success) {
    dev_packed.clear();
    success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
                                         _packed_permissions)==UCL_SUCCESS);
-    _c_bytes+=dev_packed.row_bytes();                                         
+    dev_acc.clear();
+    success=success && (dev_acc.alloc(_max_atoms,*dev,
+                                      UCL_READ_WRITE)==UCL_SUCCESS);
+    _c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
  }
  if (_max_host>0) {
    nbor_host.clear();
@ -194,6 +197,7 @@ void Neighbor::clear() {

    host_packed.clear();
    host_acc.clear();
+    dev_acc.clear();
    dev_nbor.clear();
    nbor_host.clear();
    dev_packed.clear();
@ -278,6 +282,15 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
  UCL_D_Vec<int> acc_view;
  acc_view.view_offset(inum,dev_nbor,inum*2);
  ucl_copy(acc_view,host_acc,true);
+
+  UCL_H_Vec<int> host_view;
+  host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
+  for (int ii=0; ii<inum; ii++) {
+    int i=ilist[ii];
+    host_view[i] = ii;
+  }
+  ucl_copy(dev_acc,host_view,true);
+
  time_nbor.stop();

  if (_use_packing==false) {
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@ -199,6 +199,8 @@ class Neighbor {
  UCL_H_Vec<int> host_packed;
  /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
  UCL_H_Vec<int> host_acc;
+  /// Device storage for accessing atom indices from the neighbor list (3-body)
+  UCL_D_Vec<int> dev_acc;

  // ----------------- Data for GPU Neighbor Calculation ---------------

--- a/lib/gpu/lal_precision.h
+++ b/lib/gpu/lal_precision.h
@ -115,6 +115,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 #define OCL_DEFAULT_VENDOR "generic"
 #endif

+#ifdef INTEL_OCL
+#define OCL_DEFAULT_VENDOR "intel"
+#endif
+
+#ifdef PHI_OCL
+#define OCL_DEFAULT_VENDOR "phi"
+#endif
+
 #ifndef OCL_DEFAULT_VENDOR
 #define OCL_DEFAULT_VENDOR "none"
 #endif
--- a/lib/gpu/lal_sw.cpp
+++ b/lib/gpu/lal_sw.cpp
@ -196,11 +196,12 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

-  // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1 
-  // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1
+  // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
+  // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
+
  this->k_pair.set_size(GX,BX);
  this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
                   &map, &elem2param, &_nelements,
@ -230,18 +231,21 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  }
+
  this->time_pair.stop();
 }

--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@ -195,7 +195,6 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
        numtyp sw_powerq=sw2_ijparam.w;
        numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
        numtyp sw_cut=sw3_ijparam.x;
-        numtyp sw_cutsq=sw3_ijparam.y;
        numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
            pow(sw_sigma,sw_powerp);
        numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
@ -345,7 +344,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                const int t_per_atom, const int evatom) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@ -394,8 +392,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
      if (rsq1 > sw3_ijparam.y) continue;

      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
-      sw_sigma=sw1_ijparam.y;
-      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;

@ -419,15 +415,11 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
        numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
        if (rsq2 < sw3_ikparam.y) {   // sw_cutsq=sw3[ikparam].y;
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
-          sw_sigma=sw1_ikparam.y;
-          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;

          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
-          sw_epsilon=sw1_ijkparam.x;
-          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -467,14 +459,14 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
+                             const __global int * dev_acc,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
-                             const int t_per_atom) {
+                             const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@ -522,18 +514,20 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
      if (rsq1 > sw3_ijparam.y) continue;

      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
-      sw_sigma=sw1_ijparam.y;
-      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k]; 
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -559,15 +553,11 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,

        if (rsq2 < sw3_ikparam.y) {
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
-          sw_sigma=sw1_ikparam.y;
-          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;

          int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
-          sw_epsilon=sw1_ijkparam.x;
-          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -607,14 +597,14 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
+                             const __global int * dev_acc,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
-                             const int t_per_atom) {
+                             const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@ -662,18 +652,20 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
      if (rsq1 > sw3_ijparam.y) continue;

      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
-      sw_sigma=sw1_ijparam.y;
-      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k]; 
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -699,15 +691,11 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,

        if (rsq2 < sw3_ikparam.y) {
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
-          sw_sigma=sw1_ikparam.y;
-          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;

          int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; // jik
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
-          sw_epsilon=sw1_ijkparam.x;
-          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@ -269,7 +269,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
  else
    _eflag=0;

-  int ainum=nall;
+  int ainum=nlist;
  int nbor_pitch=this->nbor->nbor_pitch();
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -279,7 +279,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -364,7 +364,7 @@ int ** TersoffT::compute(const int ago, const int inum_full,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -437,16 +437,18 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }

  this->time_pair.stop();
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -184,7 +184,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
                             __global acctyp4 * zetaij,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const int eflag, const int nall, const int inum,
+                             const int eflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
  __local int tpa_sq,n_stride;
  tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -210,7 +210,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,

  __syncthreads();

-  if (ii<nall) {
+  if (ii<inum) {
    int nbor_j, nbor_end;
    int i, numj;

@ -597,11 +597,12 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
+                                  const __global int * dev_acc,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
-                                  const int t_per_atom) {
+                                  const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -666,13 +667,17 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -818,11 +823,12 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
+                                        const __global int * dev_acc,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
-                                        const int t_per_atom) {
+                                        const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -887,13 +893,17 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@ -269,7 +269,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
  else
    _eflag=0;

-  int ainum=nall;
+  int ainum=nlist;
  int nbor_pitch=this->nbor->nbor_pitch();
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -279,7 +279,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -364,7 +364,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -437,16 +437,18 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }

  this->time_pair.stop();
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@ -184,7 +184,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
                             __global acctyp4 * zetaij,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const int eflag, const int nall, const int inum,
+                             const int eflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
  __local int tpa_sq,n_stride;
  tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -210,7 +210,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,

  __syncthreads();

-  if (ii<nall) {
+  if (ii<inum) {
    int nbor_j, nbor_end;
    int i, numj;

@ -605,11 +605,12 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
+                                  const __global int * dev_acc,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
-                                  const int t_per_atom) {
+                                  const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
@ -676,13 +677,17 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -835,11 +840,12 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
+                                        const __global int * dev_acc,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
-                                        const int t_per_atom) {
+                                        const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
@ -906,13 +912,17 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@ -294,7 +294,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
  else
    _eflag=0;

-  int ainum=nall;
+  int ainum=nlist;
  int nbor_pitch=this->nbor->nbor_pitch();
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -304,7 +304,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -389,7 +389,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -463,16 +463,18 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }

  this->time_pair.stop();
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@ -188,7 +188,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
                             __global acctyp4 * zetaij,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const int eflag, const int nall, const int inum,
+                             const int eflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
  __local int tpa_sq,n_stride;
  tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -216,7 +216,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,

  __syncthreads();

-  if (ii<nall) {
+  if (ii<inum) {
    int nbor_j, nbor_end;
    int i, numj;

@ -617,11 +617,12 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
+                                  const __global int * dev_acc,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
-                                  const int t_per_atom) {
+                                  const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -686,13 +687,17 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -838,11 +843,12 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
+                                        const __global int * dev_acc,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
-                                        const int t_per_atom) {
+                                        const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -907,13 +913,17 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;