git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2016-07-01 23:27:26 +00:00
parent 8366b35459
commit 9656958169
245 changed files with 4890 additions and 4832 deletions
--- a/lib/gpu/Makefile.lammps.mingw-cross
+++ b/lib/gpu/Makefile.lammps.mingw-cross
@ -1,6 +1,6 @@
 # Settings that the LAMMPS build will import when this package library is used
 # settings for OpenCL builds
 gpu_SYSINC =
-gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
+gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
 gpu_SYSPATH = 

--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@ -7,7 +7,7 @@

 EXTRAMAKE = Makefile.lammps.standard

-ifeq($(CUDA_HOME),)
+ifeq ($(CUDA_HOME),)
 CUDA_HOME = /usr/local/cuda
 endif

--- a/lib/gpu/Makefile.mingw32-cross
+++ b/lib/gpu/Makefile.mingw32-cross
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
 OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
        -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
        -I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
 EXTRAMAKE = Makefile.lammps.mingw-cross
--- a/lib/gpu/Makefile.mingw32-cross-mpi
+++ b/lib/gpu/Makefile.mingw32-cross-mpi
@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
        -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
 	-I../../tools/mingw-cross/mpich2-win32/include/ \
        -DMPICH_IGNORE_CXX_SEEK
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
 	-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
--- a/lib/gpu/Makefile.mingw64-cross
+++ b/lib/gpu/Makefile.mingw64-cross
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
 OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
 	-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
        -I$(CUDA_HOME)/include
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
 	-L../../src/STUBS -lmpi_mingw64
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
--- a/lib/gpu/Makefile.mingw64-cross-mpi
+++ b/lib/gpu/Makefile.mingw64-cross-mpi
@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
 	-I../../tools/mingw-cross/mpich2-win64/include/ \
        -DMPICH_IGNORE_CXX_SEEK
 
-OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
+OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
 	-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
 OCL_PREC = -D_SINGLE_DOUBLE
 OCL_TUNE = -DFERMI_OCL
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
  cuDriverGetVersion(&driver_version);
  out << "CUDA Driver Version:                           "
      << driver_version/1000 << "." << driver_version%100
-		  << std::endl;
+                  << std::endl;
  #endif

  if (num_devices() == 0)
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
    gpu_nbor=1;
  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
    gpu_nbor=2;
+  _gpu_nbor=gpu_nbor;

  int _gpu_host=0;
  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
  if (!success)
    return NULL;

-  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
+  // Originally nall == nlist was required, so that neighbors of neighbors
+  // could be indexed directly after the list had been re-arranged:
+//  nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
+
+  // The nall == nlist requirement is now removed, so this also works within pair hybrid
+  nbor->get_host(nlist,ilist,numj,firstneigh,block_size());

  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
  #ifdef THREE_CONCURRENT
@ -185,11 +191,11 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
-                                         const int nall, double **host_x,
-                                         int *host_type, double *sublo,
-                                         double *subhi, tagint *tag,
-                                         int **nspecial, tagint **special,
-                                         bool &success) {
+                                       const int nall, double **host_x,
+                                       int *host_type, double *sublo,
+                                       double *subhi, tagint *tag,
+                                       int **nspecial, tagint **special,
+                                       bool &success) {
  success=true;
  resize_atom(inum,nall,success);
  resize_local(nall,host_inum,nbor->max_nbors(),success);
@ -214,7 +220,7 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall, 
+void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
                         const int nlist, double **host_x, int *host_type,
                         int *ilist, int *numj, int **firstneigh,
                         const bool eflag, const bool vflag, const bool eatom,
@ -230,7 +236,7 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
  }

  int ago=hd_balancer.ago_first(f_ago);
-  int inum=hd_balancer.balance(ago,nlocal,cpu_time);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
  ans->inum(inum);
  #ifdef THREE_CONCURRENT
  ans2->inum(inum);
--- a/lib/gpu/lal_base_three.h
+++ b/lib/gpu/lal_base_three.h
@ -205,6 +205,7 @@ class BaseThree {
 protected:
  bool _compiled;
  int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
+  int _gpu_nbor;
  double _max_bytes, _max_an_bytes;
  double _gpu_overhead, _driver_overhead;
  UCL_D_Vec<int> *_nbor_data;
--- a/lib/gpu/lal_born.cpp
+++ b/lib/gpu/lal_born.cpp
@ -80,7 +80,7 @@ int BornT::init(const int ntypes, double **host_cutsq,

  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
-		                     host_d,host_offset);
+                                     host_d,host_offset);

  cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
@ -113,7 +113,7 @@ void BornT::reinit(const int ntypes, double **host_rhoinv,
  this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
                         host_born1,host_born2,host_born3);
  this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
-		                     host_d,host_offset);
+                                     host_d,host_offset);
 }

 template <class numtyp, class acctyp>
--- a/lib/gpu/lal_born_coul_long.cpp
+++ b/lib/gpu/lal_born_coul_long.cpp
@ -84,7 +84,7 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho

  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
-		         host_d,host_offset);
+                         host_d,host_offset);

  cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
--- a/lib/gpu/lal_born_coul_wolf.cpp
+++ b/lib/gpu/lal_born_coul_wolf.cpp
@ -84,7 +84,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho

  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
-		                     host_d,host_offset);
+                                     host_d,host_offset);

  cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
--- a/lib/gpu/lal_buck.cpp
+++ b/lib/gpu/lal_buck.cpp
@ -79,7 +79,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,

  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
-		         host_offset);
+                         host_offset);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
--- a/lib/gpu/lal_buck_coul.cpp
+++ b/lib/gpu/lal_buck_coul.cpp
@ -81,7 +81,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,

  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
-		         host_offset);
+                         host_offset);

  cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq,
--- a/lib/gpu/lal_buck_coul_long.cpp
+++ b/lib/gpu/lal_buck_coul_long.cpp
@ -83,7 +83,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,

  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
-		         host_offset);
+                         host_offset);

  cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
--- a/lib/gpu/lal_cg_cmm.cpp
+++ b/lib/gpu/lal_cg_cmm.cpp
@ -80,7 +80,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,

  lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
--- a/lib/gpu/lal_charmm_long_ext.cpp
+++ b/lib/gpu/lal_charmm_long_ext.cpp
@ -117,9 +117,9 @@ int** crml_gpu_compute_n(const int ago, const int inum_full,
 }

 void crml_gpu_compute(const int ago, const int inum_full,
-	 	                  const int nall, double **host_x, int *host_type,
+                                   const int nall, double **host_x, int *host_type,
                      int *ilist, int *numj, int **firstneigh,
-		                  const bool eflag, const bool vflag, const bool eatom,
+                                  const bool eflag, const bool vflag, const bool eatom,
                      const bool vatom, int &host_start, const double cpu_time,
                      bool &success, double *host_q, const int nlocal,
                      double *boxlo, double *prd) {
--- a/lib/gpu/lal_coul_long.cpp
+++ b/lib/gpu/lal_coul_long.cpp
@ -51,7 +51,7 @@ int CoulLongT::init(const int ntypes, double **host_scale,
                    const double qqrd2e, const double g_ewald) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
-			                      gpu_split,_screen,coul_long,"k_coul_long");
+                                              gpu_split,_screen,coul_long,"k_coul_long");
  if (success!=0)
    return success;

--- a/lib/gpu/lal_coul_long.h
+++ b/lib/gpu/lal_coul_long.h
@ -40,9 +40,9 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
  int init(const int ntypes, double **scale,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const double cell_size,
-	         const double gpu_split, FILE *screen,
-	         const double host_cut_coulsq, double *host_special_coul,
-	         const double qqrd2e, const double g_ewald);
+                 const double gpu_split, FILE *screen,
+                 const double host_cut_coulsq, double *host_special_coul,
+                 const double qqrd2e, const double g_ewald);

  /// Send updated coeffs from host to device (to be compatible with fix adapt)
  void reinit(const int ntypes, double **scale);
--- a/lib/gpu/lal_coul_long_ext.cpp
+++ b/lib/gpu/lal_coul_long_ext.cpp
@ -114,28 +114,28 @@ void cl_gpu_clear() {
 }

 int** cl_gpu_compute_n(const int ago, const int inum_full,
-		       const int nall, double **host_x, int *host_type,
-		       double *sublo, double *subhi, tagint *tag, int **nspecial,
-		       tagint **special, const bool eflag, const bool vflag,
-		       const bool eatom, const bool vatom, int &host_start,
-		       int **ilist, int **jnum,  const double cpu_time,
-		       bool &success, double *host_q, double *boxlo,
-		       double *prd) {
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum,  const double cpu_time,
+                       bool &success, double *host_q, double *boxlo,
+                       double *prd) {
  return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
-		      subhi, tag, nspecial, special, eflag, vflag, eatom,
-		      vatom, host_start, ilist, jnum, cpu_time, success,
-		      host_q, boxlo, prd);
+                      subhi, tag, nspecial, special, eflag, vflag, eatom,
+                      vatom, host_start, ilist, jnum, cpu_time, success,
+                      host_q, boxlo, prd);
 }

 void cl_gpu_compute(const int ago, const int inum_full, const int nall,
-		    double **host_x, int *host_type, int *ilist, int *numj,
-		    int **firstneigh, const bool eflag, const bool vflag,
-		    const bool eatom, const bool vatom, int &host_start,
-		    const double cpu_time, bool &success, double *host_q,
-		    const int nlocal, double *boxlo, double *prd) {
+                    double **host_x, int *host_type, int *ilist, int *numj,
+                    int **firstneigh, const bool eflag, const bool vflag,
+                    const bool eatom, const bool vatom, int &host_start,
+                    const double cpu_time, bool &success, double *host_q,
+                    const int nlocal, double *boxlo, double *prd) {
  CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
-	       firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
-	       host_q,nlocal,boxlo,prd);
+               firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+               host_q,nlocal,boxlo,prd);
 }

 double cl_gpu_bytes() {
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -650,7 +650,7 @@ int DeviceT::compile_kernels() {
  int flag=0;

  if (_compiled)
-  	return flag;
+          return flag;

  dev_program=new UCL_Program(*gpu);
  int success=dev_program->load_string(device,compile_string().c_str());
--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@ -238,7 +238,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
          if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
            r3inv = r2inv*rinv;
            r5inv = r3inv*r2inv;
-	          r7inv = r5inv*r2inv;
+                  r7inv = r5inv*r2inv;
            pdotp  = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
            pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
            pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
--- a/lib/gpu/lal_dpd.cpp
+++ b/lib/gpu/lal_dpd.cpp
@ -76,7 +76,7 @@ int DPDT::init(const int ntypes,

  coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma,
-			 host_sigma,host_cut);
+                         host_sigma,host_cut);

  UCL_H_Vec<numtyp> host_rsq(lj_types*lj_types,*(this->ucl_device),
                             UCL_WRITE_ONLY);
@ -164,7 +164,7 @@ void DPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma,
  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
                               UCL_WRITE_ONLY);
  this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma,
-			 host_sigma,host_cut);
+                         host_sigma,host_cut);
 }

 template class DPD<PRECISION,ACC_PRECISION>;
--- a/lib/gpu/lal_gauss.cpp
+++ b/lib/gpu/lal_gauss.cpp
@ -75,7 +75,7 @@ int GaussT::init(const int ntypes,

  gauss1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b,
-			                   host_cutsq,host_offset);
+                                           host_cutsq,host_offset);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
@ -99,7 +99,7 @@ void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a,
    host_write[i]=0.0;

  this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b,
-			                   host_cutsq,host_offset);
+                                           host_cutsq,host_offset);
 }

 template <class numtyp, class acctyp>
--- a/lib/gpu/lal_gayberne.cpp
+++ b/lib/gpu/lal_gayberne.cpp
@ -84,19 +84,19 @@ int GayBerneT::init(const int ntypes, const double gamma,

  sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
-			 host_sigma,host_epsilon);
+                         host_sigma,host_epsilon);

  this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
-			 host_cutsq,h_form);
+                         host_cutsq,h_form);

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq,h_form);
+                         host_cutsq,h_form);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
  dev_error.zero();
@ -209,7 +209,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));
      NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
      this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
-			                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
+                                         ELLIPSE_ELLIPSE,_shared_types,_lj_types);
      this->time_nbor1.stop();

      this->time_ellipsoid.start();
@ -242,7 +242,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
      NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
                                this->_last_ellipse)/BX));
      this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
-			                 SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
+                                         SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
      this->time_nbor2.stop();

      this->time_ellipsoid2.start();
@ -300,7 +300,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
    NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
    this->time_nbor1.start();
    this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
-		                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
+                                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
    this->time_nbor1.stop();
    this->time_ellipsoid.start();
    this->k_ellipsoid.set_size(GX,BX);
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@ -26,58 +26,58 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
  den = ucl_recip(den);

  ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
-		    m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
-		    m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
-		    m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
-		    m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
+                    m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
+                    m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
+                    m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
+                    m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;

  ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
-		    (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
-		    (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
-		    m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
-		    m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
+                    (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
+                    (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
+                    m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
+                    m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;

  ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
-		    m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
-		    m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
-		    (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
-		    m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
+                    m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
+                    m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
+                    (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
+                    m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;

  ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
-		    m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
-		    m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
-		    m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
-		    m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
+                    m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
+                    m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
+                    m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
+                    m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;

  ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
-		    (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
-		    (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
-		    m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
-		    m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
+                    (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
+                    (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
+                    m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
+                    m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;

  ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
-		    m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
-		    (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
-		    m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
-		    (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
+                    m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
+                    (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
+                    m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
+                    (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;

  ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
-		    (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
-		    m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
-		    m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
-		    m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
+                    (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
+                    m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
+                    m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
+                    m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;

  ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
-		     (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
-		     (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
-		     m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
-		     m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
+                     (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
+                     (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
+                     m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
+                     m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;

  ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
-		    m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
-		    m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
-		    (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
-		    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
+                    m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
+                    m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
+                    (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
+                    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }

 __kernel void k_gayberne(const __global numtyp4 *restrict x_,
--- a/lib/gpu/lal_lj.cpp
+++ b/lib/gpu/lal_lj.cpp
@ -76,11 +76,11 @@ int LJT::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq);
+                         host_cutsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
--- a/lib/gpu/lal_lj96.cpp
+++ b/lib/gpu/lal_lj96.cpp
@ -76,11 +76,11 @@ int LJ96T::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq);
+                         host_cutsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
--- a/lib/gpu/lal_lj_class2_long.cpp
+++ b/lib/gpu/lal_lj_class2_long.cpp
@ -80,11 +80,11 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq, host_cut_ljsq);
+                         host_cutsq, host_cut_ljsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<4; i++) {
--- a/lib/gpu/lal_lj_coul.cpp
+++ b/lib/gpu/lal_lj_coul.cpp
@ -79,11 +79,11 @@ int LJCoulT::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cut_ljsq, host_cut_coulsq);
+                         host_cut_ljsq, host_cut_coulsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
--- a/lib/gpu/lal_lj_coul_debye.cpp
+++ b/lib/gpu/lal_lj_coul_debye.cpp
@ -80,11 +80,11 @@ int LJCoulDebyeT::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			                   host_cut_ljsq, host_cut_coulsq);
+                                           host_cut_ljsq, host_cut_coulsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		                     host_offset);
+                                     host_offset);

  cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
--- a/lib/gpu/lal_lj_coul_long.cpp
+++ b/lib/gpu/lal_lj_coul_long.cpp
@ -80,11 +80,11 @@ int LJCoulLongT::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-	   host_cutsq, host_cut_ljsq);
+           host_cutsq, host_cut_ljsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<4; i++) {
--- a/lib/gpu/lal_lj_coul_msm.cpp
+++ b/lib/gpu/lal_lj_coul_msm.cpp
@ -81,11 +81,11 @@ int LJCoulMSMT::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq, host_cut_ljsq);
+                         host_cutsq, host_cut_ljsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  // pack gcons and dgcons
  int nrows, ncols;
--- a/lib/gpu/lal_lj_cubic.cpp
+++ b/lib/gpu/lal_lj_cubic.cpp
@ -77,11 +77,11 @@ int LJCubicT::init(const int ntypes,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq);
+                         host_cutsq);

  lj2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj2,host_write,host_cut_inner_sq,
-			 host_cut_inner,host_sigma,host_epsilon);
+                         host_cut_inner,host_sigma,host_epsilon);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4);
--- a/lib/gpu/lal_lj_dsf.cpp
+++ b/lib/gpu/lal_lj_dsf.cpp
@ -84,11 +84,11 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cut_ljsq, host_cutsq);
+                         host_cut_ljsq, host_cutsq);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<4; i++) {
--- a/lib/gpu/lal_lj_expand.cpp
+++ b/lib/gpu/lal_lj_expand.cpp
@ -76,11 +76,11 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq, host_shift);
+                         host_cutsq, host_shift);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@ -82,9 +82,9 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
      int mtype=itype*lj_types+jtype;
      if (r2inv<lj1[mtype].z) {
        numtyp r = ucl_sqrt(r2inv);
-	numtyp rshift = r - lj1[mtype].w;
-	numtyp rshiftsq = rshift*rshift;
-	r2inv = ucl_recip(rshiftsq);
+        numtyp rshift = r - lj1[mtype].w;
+        numtyp rshiftsq = rshift*rshift;
+        r2inv = ucl_recip(rshiftsq);
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj/rshift/r;
@ -175,9 +175,9 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,

      if (r2inv<lj1[mtype].z) {
        numtyp r = ucl_sqrt(r2inv);
-	numtyp rshift = r - lj1[mtype].w;
-	numtyp rshiftsq = rshift*rshift;
-	r2inv = ucl_recip(rshiftsq);
+        numtyp rshift = r - lj1[mtype].w;
+        numtyp rshiftsq = rshift*rshift;
+        r2inv = ucl_recip(rshiftsq);
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj/rshift/r;
--- a/lib/gpu/lal_mie.cpp
+++ b/lib/gpu/lal_mie.cpp
@ -76,11 +76,11 @@ int MieT::init(const int ntypes, double **host_cutsq,

  mie1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,mie1,host_write,host_mie1,host_mie2,
-			                   host_gamA,host_gamR);
+                                           host_gamA,host_gamR);

  mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4,
-			                   host_offset,host_cutsq);
+                                           host_offset,host_cutsq);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@ -127,7 +127,10 @@ void Neighbor::alloc(bool &success) {
    dev_packed.clear();
    success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
                                         _packed_permissions)==UCL_SUCCESS);
-    _c_bytes+=dev_packed.row_bytes();                                         
+    dev_acc.clear();
+    success=success && (dev_acc.alloc(_max_atoms,*dev,
+                                      UCL_READ_WRITE)==UCL_SUCCESS);
+    _c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
  }
  if (_max_host>0) {
    nbor_host.clear();
@ -194,6 +197,7 @@ void Neighbor::clear() {

    host_packed.clear();
    host_acc.clear();
+    dev_acc.clear();
    dev_nbor.clear();
    nbor_host.clear();
    dev_packed.clear();
@ -225,7 +229,7 @@ double Neighbor::host_memory_usage() const {
 }

 void Neighbor::get_host(const int inum, int *ilist, int *numj,
-                           int **firstneigh, const int block_size) {  
+                        int **firstneigh, const int block_size) {
  _nbor_time_avail=true;
  time_nbor.start();

@ -278,6 +282,15 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
  UCL_D_Vec<int> acc_view;
  acc_view.view_offset(inum,dev_nbor,inum*2);
  ucl_copy(acc_view,host_acc,true);
+
+  UCL_H_Vec<int> host_view;
+  host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
+  for (int ii=0; ii<inum; ii++) {
+    int i=ilist[ii];
+    host_view[i] = ii;
+  }
+  ucl_copy(dev_acc,host_view,true);
+
  time_nbor.stop();

  if (_use_packing==false) {
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@ -199,6 +199,8 @@ class Neighbor {
  UCL_H_Vec<int> host_packed;
  /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
  UCL_H_Vec<int> host_acc;
+  /// Device storage for accessing atom indices from the neighbor list (3-body)
+  UCL_D_Vec<int> dev_acc;

  // ----------------- Data for GPU Neighbor Calculation ---------------

--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -118,24 +118,24 @@ __kernel void transpose(__global tagint *restrict out,
                        const __global tagint *restrict in,
                        int columns_in, int rows_in)
 {
-	__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
+        __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];

-	unsigned ti=THREAD_ID_X;
-	unsigned tj=THREAD_ID_Y;
-	unsigned bi=BLOCK_ID_X;
-	unsigned bj=BLOCK_ID_Y;
+        unsigned ti=THREAD_ID_X;
+        unsigned tj=THREAD_ID_Y;
+        unsigned bi=BLOCK_ID_X;
+        unsigned bj=BLOCK_ID_Y;

-	unsigned i=bi*BLOCK_CELL_2D+ti;
-	unsigned j=bj*BLOCK_CELL_2D+tj;
-	if ((i<columns_in) && (j<rows_in))
-		block[tj][ti]=in[j*columns_in+i];
+        unsigned i=bi*BLOCK_CELL_2D+ti;
+        unsigned j=bj*BLOCK_CELL_2D+tj;
+        if ((i<columns_in) && (j<rows_in))
+                block[tj][ti]=in[j*columns_in+i];

-	__syncthreads();
+        __syncthreads();

-	i=bj*BLOCK_CELL_2D+ti;
-	j=bi*BLOCK_CELL_2D+tj;
-	if ((i<rows_in) && (j<columns_in))
-		out[j*rows_in+i] = block[ti][tj];
+        i=bj*BLOCK_CELL_2D+ti;
+        j=bi*BLOCK_CELL_2D+tj;
+        if ((i<rows_in) && (j<columns_in))
+                out[j*rows_in+i] = block[ti][tj];
 }

 __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
@ -191,7 +191,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
      nbor_list[pid_i]=pid_i;
    } else {
      stride=0;
-    	neigh_counts=host_numj+pid_i-inum;
+            neigh_counts=host_numj+pid_i-inum;
      neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
    }

@ -243,8 +243,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
                }
              }
            }
-	          __syncthreads();
-	        } // for (k)
+                  __syncthreads();
+                } // for (k)
        }
      }
    }
--- a/lib/gpu/lal_neighbor_shared.cpp
+++ b/lib/gpu/lal_neighbor_shared.cpp
@ -51,7 +51,7 @@ void NeighborShared::clear() {
 void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor,
                                     const std::string flags) {
  if (_compiled)
-  	return;
+          return;

  _gpu_nbor=gpu_nbor;
  if (_gpu_nbor==0) {
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@ -270,19 +270,19 @@ __kernel void interp(const __global numtyp4 *restrict x_,
        int my=mz+fast_mul(ny,npts_x);
        for (int m=0; m<order; m++) {
          grdtyp y0=z0*rho1d_1[m][tid];
-  	      for (int l=0; l<order; l++) {
-  	        grdtyp x0=y0*rho1d_0[l][tid];
-  	        grdtyp4 el=brick[my+l];
-  	        ek.x-=x0*el.x;
-  	        ek.y-=x0*el.y;
-  	        ek.z-=x0*el.z;
-  	      }
+                for (int l=0; l<order; l++) {
+                  grdtyp x0=y0*rho1d_0[l][tid];
+                  grdtyp4 el=brick[my+l];
+                  ek.x-=x0*el.x;
+                  ek.y-=x0*el.y;
+                  ek.z-=x0*el.z;
+                }
          my+=npts_x;
        }
        mz+=npts_yx;
-  	  }
+            }
    }
    ans[ii]=ek;
-	}
+        }
 }

--- a/lib/gpu/lal_precision.h
+++ b/lib/gpu/lal_precision.h
@ -115,6 +115,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 #define OCL_DEFAULT_VENDOR "generic"
 #endif

+#ifdef INTEL_OCL
+#define OCL_DEFAULT_VENDOR "intel"
+#endif
+
+#ifdef PHI_OCL
+#define OCL_DEFAULT_VENDOR "phi"
+#endif
+
 #ifndef OCL_DEFAULT_VENDOR
 #define OCL_DEFAULT_VENDOR "none"
 #endif
--- a/lib/gpu/lal_re_squared.cpp
+++ b/lib/gpu/lal_re_squared.cpp
@ -81,19 +81,19 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,

  sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
-			 host_sigma,host_epsilon);
+                         host_sigma,host_epsilon);

  this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
-			 host_cutsq,h_form);
+                         host_cutsq,h_form);

  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
-			 host_cutsq,h_form);
+                         host_cutsq,h_form);

  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
-		         host_offset);
+                         host_offset);

  dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
  dev_error.zero();
@ -197,7 +197,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));
      NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
      this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE,
-			                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
+                                         ELLIPSE_ELLIPSE,_shared_types,_lj_types);
      this->time_nbor1.stop();

      this->time_ellipsoid.start();
@ -214,7 +214,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
      // ------------ ELLIPSE_SPHERE ---------------
      this->time_nbor2.start();
      this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
-			                 ELLIPSE_SPHERE,_shared_types,_lj_types);
+                                         ELLIPSE_SPHERE,_shared_types,_lj_types);
      this->time_nbor2.stop();

      this->time_ellipsoid2.start();
@ -245,7 +245,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
      NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
                               this->_last_ellipse)/BX));
      this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
-			                 SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
+                                         SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
      this->time_nbor3.stop();

      this->time_ellipsoid3.start();
@ -300,7 +300,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
    NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
    this->time_nbor1.start();
    this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
-		                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
+                                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
    this->time_nbor1.stop();
    this->time_ellipsoid.start();
    this->k_ellipsoid.set_size(GX,BX);
--- a/lib/gpu/lal_soft.cpp
+++ b/lib/gpu/lal_soft.cpp
@ -74,7 +74,7 @@ int SoftT::init(const int ntypes, double **host_cutsq,

  coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_prefactor,
-			                   host_cut,host_cutsq);
+                                           host_cut,host_cutsq);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
@ -98,7 +98,7 @@ void SoftT::reinit(const int ntypes, double **host_cutsq,
    host_write[i]=0.0;

  this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor,
-			                   host_cut,host_cutsq);
+                                           host_cut,host_cutsq);
 }

 template <class numtyp, class acctyp>
--- a/lib/gpu/lal_sw.cpp
+++ b/lib/gpu/lal_sw.cpp
@ -196,11 +196,12 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));

-  // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1 
-  // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1
+  // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
+  // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
  int ainum=this->ans->inum();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
+
  this->k_pair.set_size(GX,BX);
  this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
                   &map, &elem2param, &_nelements,
@ -230,18 +231,21 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
                          &map, &elem2param, &_nelements,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  }
+
  this->time_pair.stop();
 }

--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@ -195,7 +195,6 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
        numtyp sw_powerq=sw2_ijparam.w;
        numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
        numtyp sw_cut=sw3_ijparam.x;
-        numtyp sw_cutsq=sw3_ijparam.y;
        numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
            pow(sw_sigma,sw_powerp);
        numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
@ -345,7 +344,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                const int t_per_atom, const int evatom) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@ -394,8 +392,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
      if (rsq1 > sw3_ijparam.y) continue;

      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
-      sw_sigma=sw1_ijparam.y;
-      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;

@ -419,15 +415,11 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
        numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
        if (rsq2 < sw3_ikparam.y) {   // sw_cutsq=sw3[ikparam].y;
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
-          sw_sigma=sw1_ikparam.y;
-          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;

          int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
-          sw_epsilon=sw1_ijkparam.x;
-          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -467,14 +459,14 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
+                             const __global int * dev_acc,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
-                             const int t_per_atom) {
+                             const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@ -522,18 +514,20 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
      if (rsq1 > sw3_ijparam.y) continue;

      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
-      sw_sigma=sw1_ijparam.y;
-      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k]; 
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -559,15 +553,11 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,

        if (rsq2 < sw3_ikparam.y) {
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
-          sw_sigma=sw1_ikparam.y;
-          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;

          int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
-          sw_epsilon=sw1_ijkparam.x;
-          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -607,14 +597,14 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
+                             const __global int * dev_acc,
                             __global acctyp4 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
-                             const int t_per_atom) {
+                             const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
-  numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
  numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
  numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;

@ -662,18 +652,20 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
      if (rsq1 > sw3_ijparam.y) continue;

      numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
-      sw_sigma=sw1_ijparam.y;
-      sw_gamma=sw1_ijparam.w;
      sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
      sw_cut_ij=sw3_ijparam.x;

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k]; 
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -699,15 +691,11 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,

        if (rsq2 < sw3_ikparam.y) {
          numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
-          sw_sigma=sw1_ikparam.y;
-          sw_gamma=sw1_ikparam.w;
          sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
          sw_cut_ik=sw3_ikparam.x;

          int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; // jik
          numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
-          sw_epsilon=sw1_ijkparam.x;
-          sw_lambda=sw1_ijkparam.z;
          sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
          sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
          numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@ -269,7 +269,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
  else
    _eflag=0;

-  int ainum=nall;
+  int ainum=nlist;
  int nbor_pitch=this->nbor->nbor_pitch();
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -279,7 +279,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -364,7 +364,7 @@ int ** TersoffT::compute(const int ago, const int inum_full,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -437,16 +437,18 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }

  this->time_pair.stop();
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -184,7 +184,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
                             __global acctyp4 * zetaij,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const int eflag, const int nall, const int inum,
+                             const int eflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
  __local int tpa_sq,n_stride;
  tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -210,7 +210,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,

  __syncthreads();

-  if (ii<nall) {
+  if (ii<inum) {
    int nbor_j, nbor_end;
    int i, numj;

@ -597,11 +597,12 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
+                                  const __global int * dev_acc,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
-                                  const int t_per_atom) {
+                                  const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -666,13 +667,17 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -810,7 +815,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
 __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global numtyp4 *restrict ts1_in,
                                        const __global numtyp4 *restrict ts2_in,
-      	                                const __global numtyp4 *restrict ts4_in,
+                                        const __global numtyp4 *restrict ts4_in,
                                        const __global numtyp *restrict cutsq,
                                        const __global int *restrict map,
                                        const __global int *restrict elem2param,
@ -818,11 +823,12 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
+                                        const __global int * dev_acc,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
-                                        const int t_per_atom) {
+                                        const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -887,13 +893,17 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -964,7 +974,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,

        numtyp delr2[3];
        delr2[0] = kx.x-jx.x;
-      	delr2[1] = kx.y-jx.y;
+        delr2[1] = kx.y-jx.y;
        delr2[2] = kx.z-jx.z;
        numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];

--- a/lib/gpu/lal_tersoff_extra.h
+++ b/lib/gpu/lal_tersoff_extra.h
@ -186,7 +186,7 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
  if (tmp > param_c2)
    return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
    // error in negligible 2nd term fixed 9/30/2015
-		// (1.0 - 0.5*(1.0 +  1.0/(2.0*param->powern)) *
+                // (1.0 - 0.5*(1.0 +  1.0/(2.0*param->powern)) *
      ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
       ucl_powr(tmp,-param_powern)));
  if (tmp < param_c4) return (numtyp)0.0;
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@ -269,7 +269,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
  else
    _eflag=0;

-  int ainum=nall;
+  int ainum=nlist;
  int nbor_pitch=this->nbor->nbor_pitch();
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -279,7 +279,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -364,7 +364,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -437,16 +437,18 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }

  this->time_pair.stop();
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@ -184,7 +184,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
                             __global acctyp4 * zetaij,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const int eflag, const int nall, const int inum,
+                             const int eflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
  __local int tpa_sq,n_stride;
  tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -210,7 +210,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,

  __syncthreads();

-  if (ii<nall) {
+  if (ii<inum) {
    int nbor_j, nbor_end;
    int i, numj;

@ -605,11 +605,12 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
+                                  const __global int * dev_acc,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
-                                  const int t_per_atom) {
+                                  const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
@ -676,13 +677,17 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -826,8 +831,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
 __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global numtyp4 *restrict ts1_in,
                                        const __global numtyp4 *restrict ts2_in,
-      	                                const __global numtyp4 *restrict ts4_in,
-      	                                const __global numtyp4 *restrict ts5_in,
+                                        const __global numtyp4 *restrict ts4_in,
+                                        const __global numtyp4 *restrict ts5_in,
                                        const __global numtyp *restrict cutsq,
                                        const __global int *restrict map,
                                        const __global int *restrict elem2param,
@ -835,11 +840,12 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
+                                        const __global int * dev_acc,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
-                                        const int t_per_atom) {
+                                        const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
@ -906,13 +912,17 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -983,7 +993,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,

        numtyp delr2[3];
        delr2[0] = kx.x-jx.x;
-      	delr2[1] = kx.y-jx.y;
+        delr2[1] = kx.y-jx.y;
        delr2[2] = kx.z-jx.z;
        numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];

--- a/lib/gpu/lal_tersoff_mod_extra.h
+++ b/lib/gpu/lal_tersoff_mod_extra.h
@ -180,12 +180,12 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
 {
  numtyp tmp = param_beta * zeta;
  if (tmp > param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) *
-	  ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
+          ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
  if (tmp < param_ca4) return (numtyp)0.0;

  numtyp tmp_n = ucl_powr(tmp,param_powern);
  return (numtyp)-0.5 *(param_powern/param_powern_del) *
-	  ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
+          ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
    ((numtyp)2.0*param_powern_del)))*tmp_n / zeta;
 }

--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@ -294,7 +294,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
  else
    _eflag=0;

-  int ainum=nall;
+  int ainum=nlist;
  int nbor_pitch=this->nbor->nbor_pitch();
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -304,7 +304,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -389,7 +389,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full,
  this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
                   &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                   &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                   &_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
+                   &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);

  int evatom=0;
  if (eatom || vatom)
@ -463,16 +463,18 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
    this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

  } else {
    this->k_three_end.set_size(GX,BX);
    this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
                          &map, &elem2param, &_nelements, &_nparams, &_zetaij,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->nbor->dev_acc,
                          &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                          &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
  }

  this->time_pair.stop();
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@ -188,7 +188,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
                             __global acctyp4 * zetaij,
                             const __global int * dev_nbor,
                             const __global int * dev_packed,
-                             const int eflag, const int nall, const int inum,
+                             const int eflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
  __local int tpa_sq,n_stride;
  tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -216,7 +216,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,

  __syncthreads();

-  if (ii<nall) {
+  if (ii<inum) {
    int nbor_j, nbor_end;
    int i, numj;

@ -617,11 +617,12 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_packed,
+                                  const __global int * dev_acc,
                                  __global acctyp4 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
-                                  const int t_per_atom) {
+                                  const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -686,13 +687,17 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -830,7 +835,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
 __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global numtyp4 *restrict ts1_in,
                                        const __global numtyp4 *restrict ts2_in,
-      	                                const __global numtyp4 *restrict ts4_in,
+                                        const __global numtyp4 *restrict ts4_in,
                                        const __global numtyp *restrict cutsq,
                                        const __global int *restrict map,
                                        const __global int *restrict elem2param,
@ -838,11 +843,12 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_packed,
+                                        const __global int * dev_acc,
                                        __global acctyp4 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
-                                        const int t_per_atom) {
+                                        const int t_per_atom, const int gpu_nbor) {
  __local int tpa_sq, n_stride;
  tpa_sq=fast_mul(t_per_atom,t_per_atom);
  numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -907,13 +913,17 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
      mdelr1[1] = -delr1[1];
      mdelr1[2] = -delr1[2];

-      int nbor_k=j+nbor_pitch;
-      int numk=dev_nbor[nbor_k];
+      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
+        if (gpu_nbor) nbor_k=j+nbor_pitch;
+        else nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
        k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
        nbor_k+=offset_k;
      } else {
+        nbor_k=dev_acc[j]+nbor_pitch;
+        numk=dev_nbor[nbor_k];
        nbor_k+=nbor_pitch;
        nbor_k=dev_nbor[nbor_k];
        k_end=nbor_k+numk;
@ -984,7 +994,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,

        numtyp delr2[3];
        delr2[0] = kx.x-jx.x;
-      	delr2[1] = kx.y-jx.y;
+        delr2[1] = kx.y-jx.y;
        delr2[2] = kx.z-jx.z;
        numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];

--- a/lib/gpu/lal_tersoff_zbl_extra.h
+++ b/lib/gpu/lal_tersoff_zbl_extra.h
@ -212,7 +212,7 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
  if (tmp > param_c2)
    return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
    // error in negligible 2nd term fixed 9/30/2015
-		// (1.0 - 0.5*(1.0 +  1.0/(2.0*param->powern)) *
+                // (1.0 - 0.5*(1.0 +  1.0/(2.0*param->powern)) *
      ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
       ucl_powr(tmp,-param_powern)));
  if (tmp < param_c4) return (numtyp)0.0;
--- a/lib/gpu/lal_yukawa.cpp
+++ b/lib/gpu/lal_yukawa.cpp
@ -75,7 +75,7 @@ int YukawaT::init(const int ntypes,

  coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,host_offset,
-			 host_cutsq);
+                         host_cutsq);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
--- a/lib/gpu/lal_yukawa_colloid.cpp
+++ b/lib/gpu/lal_yukawa_colloid.cpp
@ -96,7 +96,7 @@ int YukawaColloidT::init(const int ntypes,

  coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,
-			 host_offset,host_cutsq);
+                         host_offset,host_cutsq);

  UCL_H_Vec<double> dview;
  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@ -89,10 +89,10 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
      if (rsq<coeff[mtype].z) {
        numtyp r = ucl_sqrt(rsq);
        numtyp rinv = ucl_recip(r);
-	      numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
-	      numtyp force = coeff[mtype].x * screening;
+              numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
+              numtyp force = coeff[mtype].x * screening;

-	      force = factor_lj*force * rinv;
+              force = factor_lj*force * rinv;

        f.x+=delx*force;
        f.y+=dely*force;
@ -181,10 +181,10 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
      if (rsq<coeff[mtype].z) {
        numtyp r = ucl_sqrt(rsq);
        numtyp rinv = ucl_recip(r);
-	      numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
-	      numtyp force = coeff[mtype].x * screening;
+              numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
+              numtyp force = coeff[mtype].x * screening;

-	      force = factor_lj*force * rinv;
+              force = factor_lj*force * rinv;

        f.x+=delx*force;
        f.y+=dely*force;
--- a/lib/gpu/lal_zbl.cpp
+++ b/lib/gpu/lal_zbl.cpp
@ -79,11 +79,11 @@ int ZBLT::init(const int ntypes, double **host_cutsq,

  coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_sw1,host_sw2,
-			                   host_zze, host_cutsq);
+                                           host_zze, host_cutsq);

  coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_d1a,host_d2a,
-			                   host_d3a,host_d4a);
+                                           host_d3a,host_d4a);

  coeff3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
  this->atom->type_pack4(ntypes,lj_types,coeff3,host_write,host_sw3,host_sw4,host_sw5);
--- a/lib/gpu/lal_zbl.cu
+++ b/lib/gpu/lal_zbl.cu
@ -134,10 +134,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
        force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
                       coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);

-      	if (rsq>cut_innersq) {
-	        t = r - cut_inner;
-	        force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-      	}
+              if (rsq>cut_innersq) {
+                t = r - cut_inner;
+                force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+              }

        force *= (numtyp)-1.0*ucl_recip(r);

@ -148,10 +148,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
        if (eflag>0) {
          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                         coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-       	  e += coeff3[mtype].z;
-      	  if (rsq > cut_innersq) {
-      	    e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
-      	  }
+                 e += coeff3[mtype].z;
+                if (rsq > cut_innersq) {
+                  e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+                }

          energy+=e;
        }
@ -237,10 +237,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
        force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
                       coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);

-      	if (rsq>cut_innersq) {
-	        t = r - cut_inner;
-	        force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
-      	}
+              if (rsq>cut_innersq) {
+                t = r - cut_inner;
+                force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
+              }

        force *= (numtyp)-1.0*ucl_recip(r);

@ -251,10 +251,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
        if (eflag>0) {
          numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
                         coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
-       	  e += coeff3[mtype].z;
-      	  if (rsq > cut_innersq) {
-      	    e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
-      	  }
+                 e += coeff3[mtype].z;
+                if (rsq > cut_innersq) {
+                  e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
+                }

          energy+=e;
        }