Merge pull request #1009 from ndtrung81/gpu-maint

Fixed bugs to the tersoff gpu styles for OpenCL builds and some maintenance
2018-07-25 08:42:04 -06:00
parent 45532b7106 923ae041dc
commit c31d7c6f45
15 changed files with 77 additions and 62 deletions
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@ -260,6 +260,9 @@ class UCL_Device {
  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

+  /// Select the platform that has accelerators (for compatibility with OpenCL)
+  inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
+
 private:
  int _device, _num_devices;
  std::vector<NVDProperties> _properties;
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@ -322,10 +322,12 @@ class Atom {

  // Copy charges to device asynchronously
  inline void add_q_data() {
+    time_q.start();
    if (_q_avail==false) {
      q.update_device(_nall,true);
      _q_avail=true;
    }
+    time_q.stop();
  }

  // Cast quaternions to write buffer
@ -347,10 +349,12 @@ class Atom {
  // Copy quaternions to device
  /** Copies nall()*4 elements **/
  inline void add_quat_data() {
+    time_quat.start();
    if (_quat_avail==false) {
      quat.update_device(_nall*4,true);
      _quat_avail=true;
    }
+    time_quat.stop();
  }

  /// Cast velocities and tags to write buffer
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -130,8 +130,16 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,

  // Time on the device only if 1 proc per gpu
  _time_device=true;
+
+#if 0
+  // XXX: the following setting triggers a memory leak with OpenCL and MPI
+  //      setting _time_device=true for all processes doesn't seem to be a
+  //      problem with either (no segfault, no (large) memory leak.
+  //      thus keeping this disabled for now. may need to review later.
+  //      2018-07-23 <akohlmey@gmail.com>
  if (_procs_per_gpu>1)
    _time_device=false;
+#endif

  // Set up a per device communicator
  MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@ -127,10 +127,10 @@ void Neighbor::alloc(bool &success) {
    dev_packed.clear();
    success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
                                         _packed_permissions)==UCL_SUCCESS);
-    dev_acc.clear();
-    success=success && (dev_acc.alloc(_max_atoms,*dev,
+    dev_ilist.clear();
+    success=success && (dev_ilist.alloc(_max_atoms,*dev,
                                      UCL_READ_WRITE)==UCL_SUCCESS);
-    _c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
+    _c_bytes+=dev_packed.row_bytes()+dev_ilist.row_bytes();
  }
  if (_max_host>0) {
    nbor_host.clear();
@ -197,7 +197,7 @@ void Neighbor::clear() {

    host_packed.clear();
    host_acc.clear();
-    dev_acc.clear();
+    dev_ilist.clear();
    dev_nbor.clear();
    nbor_host.clear();
    dev_packed.clear();
@ -281,7 +281,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
  }
  UCL_D_Vec<int> acc_view;
  acc_view.view_offset(inum,dev_nbor,inum*2);
-  ucl_copy(acc_view,host_acc,true);
+  ucl_copy(acc_view,host_acc,inum*2,true);

  UCL_H_Vec<int> host_view;
  host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
@ -289,7 +289,7 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
    int i=ilist[ii];
    host_view[i] = ii;
  }
-  ucl_copy(dev_acc,host_view,true);
+  ucl_copy(dev_ilist,host_view,true);

  time_nbor.stop();

@ -364,7 +364,7 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
  }
  UCL_D_Vec<int> acc_view;
  acc_view.view_offset(inum,dev_nbor,inum*2);
-  ucl_copy(acc_view,host_acc,true);
+  ucl_copy(acc_view,host_acc,inum*2,true);
  time_nbor.stop();

  if (_use_packing==false) {
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@ -110,7 +110,7 @@ class Neighbor {
      }
      if (_time_device) {
        time_nbor.add_to_total();
-        time_kernel.add_to_total();
+        if (_use_packing==false) time_kernel.add_to_total();
        if (_gpu_nbor==2) {
          time_hybrid1.add_to_total();
          time_hybrid2.add_to_total();
@ -200,7 +200,7 @@ class Neighbor {
  /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
  UCL_H_Vec<int> host_acc;
  /// Device storage for accessing atom indices from the neighbor list (3-body)
-  UCL_D_Vec<int> dev_acc;
+  UCL_D_Vec<int> dev_ilist;

  // ----------------- Data for GPU Neighbor Calculation ---------------

--- a/lib/gpu/lal_sw.cpp
+++ b/lib/gpu/lal_sw.cpp
@ -243,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

@ -252,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@ -544,7 +544,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const __global int * dev_acc,
+                             const __global int * dev_ilist,
                             const __global int * dev_short_nbor,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
@ -614,13 +614,13 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
@ -698,7 +698,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const __global int * dev_acc,
+                             const __global int * dev_ilist,
                             const __global int * dev_short_nbor,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
@ -768,13 +768,13 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@ -272,7 +272,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                   &this->dev_short_nbor,
-                   &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);

  ainum=this->ans->inum();
  nbor_pitch=this->nbor->nbor_pitch();
@ -311,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

@ -320,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -696,7 +696,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
-                                  const __global int * dev_acc,
+                                  const __global int * dev_ilist,
                                  const __global int * dev_short_nbor,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
@ -777,13 +777,13 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
@ -941,7 +941,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
-                                        const __global int * dev_acc,
+                                        const __global int * dev_ilist,
                                        const __global int * dev_short_nbor,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
@ -1022,13 +1022,13 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@ -272,7 +272,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                   &this->dev_short_nbor,
-                   &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);

  ainum=this->ans->inum();
  nbor_pitch=this->nbor->nbor_pitch();
@ -311,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

@ -320,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@ -272,7 +272,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int nbor_j, nbor_end, i, numj;
-    const int* nbor_mem=dev_packed;
+    const __global int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -432,7 +432,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int nbor, nbor_end, i, numj;
-    const int* nbor_mem=dev_packed;
+    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

@ -547,7 +547,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end;
-    const int* nbor_mem=dev_packed;
+    const __global int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -702,7 +702,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
-                                  const __global int * dev_acc,
+                                  const __global int * dev_ilist,
                                  const __global int * dev_short_nbor,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
@ -740,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end, k_end;
-    const int* nbor_mem=dev_packed;
+    const __global int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -785,13 +785,13 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
@ -956,7 +956,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
-                                        const __global int * dev_acc,
+                                        const __global int * dev_ilist,
                                        const __global int * dev_short_nbor,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
@ -994,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end, k_end;
-    const int* nbor_mem = dev_packed;
+    const __global int* nbor_mem = dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -1039,13 +1039,13 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@ -297,7 +297,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                   &this->dev_short_nbor,
-                   &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);

  ainum=this->ans->inum();
  nbor_pitch=this->nbor->nbor_pitch();
@ -337,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

@ -346,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@ -278,7 +278,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int nbor_j, nbor_end, i, numj;
-    const int* nbor_mem=dev_packed;
+    const __global int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -445,7 +445,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int nbor, nbor_end, i, numj;
-    const int* nbor_mem=dev_packed;
+    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);

@ -563,7 +563,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end;
-    const int* nbor_mem=dev_packed;
+    const __global int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -714,7 +714,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
-                                  const __global int * dev_acc,
+                                  const __global int * dev_ilist,
                                  const __global int * dev_short_nbor,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
@ -750,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end, k_end;
-    const int* nbor_mem=dev_packed;
+    const __global int* nbor_mem=dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -795,13 +795,13 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
@ -959,7 +959,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
-                                        const __global int * dev_acc,
+                                        const __global int * dev_ilist,
                                        const __global int * dev_short_nbor,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
@ -995,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int i, numj, nbor_j, nbor_end, k_end;
-    const int* nbor_mem = dev_packed;
+    const __global int* nbor_mem = dev_packed;
    int offset_j=offset/t_per_atom;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
              n_stride,nbor_end,nbor_j);
@ -1040,13 +1040,13 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
--- a/lib/gpu/lal_vashishta.cpp
+++ b/lib/gpu/lal_vashishta.cpp
@ -278,7 +278,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  } else {
@ -286,7 +286,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                          &this->nbor->dev_acc, &this->dev_short_nbor,
+                          &this->nbor->dev_ilist, &this->dev_short_nbor,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@ -554,7 +554,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const __global int * dev_acc,
+                             const __global int * dev_ilist,
                             const __global int * dev_short_nbor,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
@ -623,13 +623,13 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
@ -709,7 +709,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const __global int * dev_acc,
+                             const __global int * dev_ilist,
                             const __global int * dev_short_nbor,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
@ -778,13 +778,13 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
-        else nbor_k=dev_acc[j]+nbor_pitch;
+        else nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
-        nbor_k=dev_acc[j]+nbor_pitch;
+        nbor_k=dev_ilist[j]+nbor_pitch;
        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];