Misc Improvements to GPU Package

- Optimizations for molecular systems - Improved kernel performance and greater CPU overlap - Reduced GPU to CPU communications for discrete devices - Switch classic Intel makefiles to use LLVM-based compilers - Prefetch optimizations supported for OpenCL - Optimized data repack for quaternions
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@ -319,7 +319,7 @@ CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
 THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
 BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
 BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
-PPPM_MAX_SPLINE.
+PPPM_MAX_SPLINE, NBOR_PREFETCH.

 CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
 (NVIDIA) or OpenCL extensions (Intel) should be used for horizontal
--- a/lib/gpu/Makefile.oneapi
+++ b/lib/gpu/Makefile.oneapi
@ -12,13 +12,12 @@ EXTRAMAKE = Makefile.lammps.opencl
 LMP_INC = -DLAMMPS_SMALLBIG

 OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
-CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -fp-model fast=2 -no-prec-div \
-          -qoverride-limits
-OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
+CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
+OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
          $(LMP_INC) $(OCL_INC) $(CPP_OPT)
 OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
 OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
+OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DGERYON_NO_OCL_MARKERS

 BIN_DIR = ./
 OBJ_DIR = ./
--- a/lib/gpu/Makefile.oneapi_prof
+++ b/lib/gpu/Makefile.oneapi_prof
@ -0,0 +1,28 @@
+# /* ----------------------------------------------------------------------   
+#  Linux Makefile for Intel oneAPI - Mixed precision (with timing enabled)
+# ------------------------------------------------------------------------- */
+
+# which file will be copied to Makefile.lammps
+
+EXTRAMAKE = Makefile.lammps.opencl
+
+# this setting should match LAMMPS Makefile
+# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
+CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
+OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
+          $(LMP_INC) $(OCL_INC) $(CPP_OPT)
+OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
+OCL_PREC = -D_SINGLE_DOUBLE
+OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
+
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+
+include Opencl.makefile
--- a/lib/gpu/README
+++ b/lib/gpu/README
@ -266,6 +266,7 @@ LAL_SERIALIZE_INIT      Force serialization of initialization and compilation
                        for multiple MPI tasks sharing the same accelerator.
                        Some accelerator API implementations have had issues
                        with temporary file conflicts in the past.
+LAL_DISABLE_PREFETCH    Disable prefetch in kernels
 GERYON_FORCE_SHARED_MAIN_MEM_ON      Should only be used for builds where the
                                     accelerator is guaranteed to share physical
                                     main memory with the host (e.g. integrated
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@ -429,7 +429,7 @@ void UCL_Device::clear() {
    CU_SAFE_CALL_NS(cuCtxSetCurrent(_old_context));
    CU_SAFE_CALL_NS(cuDevicePrimaryCtxRelease(_cu_device));
 #else
-    cuCtxDestroy(_context));
+    cuCtxDestroy(_context);
 #endif
  }
  _device=-1;
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -113,7 +113,7 @@ _texture( q_tex,int2);
    dufld[5]=red_acc[5][tid];                                               \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -147,7 +147,7 @@ _texture( q_tex,int2);
    _fieldp[5]=red_acc[5][tid];                                             \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -174,7 +174,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -254,7 +254,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -277,7 +277,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -302,7 +302,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -391,7 +391,7 @@ _texture( q_tex,int2);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -416,9 +416,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
-                                 __global acctyp4 *restrict tep,
+                                 __global acctyp3 *restrict tep,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
                                 const int t_per_atom, const numtyp aewald,
@ -431,7 +431,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -440,7 +440,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

-  acctyp4 tq;
+  acctyp3 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;

  const __global numtyp4* polar1 = &extra[0];
@ -695,7 +695,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict fieldp,
+                                 __global acctyp3 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
@ -889,7 +889,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict fieldp,
+                                 __global acctyp3 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
@ -1052,9 +1052,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             const __global int *dev_short_nbor,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
-                             __global acctyp4 *restrict tep,
+                             __global acctyp3 *restrict tep,
                             const int eflag, const int vflag, const int inum,
                             const int nall, const int nbor_pitch, const int t_per_atom,
                             const numtyp aewald, const numtyp felec,
@ -1067,7 +1067,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@ -28,9 +28,9 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),

 template <class numtyp, class acctyp>
 int AnswerT::bytes_per_atom() const {
-  int bytes=11*sizeof(acctyp);
+  int bytes=10*sizeof(acctyp);
  if (_rot)
-    bytes+=4*sizeof(acctyp);
+    bytes+=3*sizeof(acctyp);
  if (_charge)
    bytes+=sizeof(acctyp);
  return bytes;
@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) {

  bool success=true;

-  _ans_fields=4;
+  _ans_fields=3;
  if (_rot)
-    _ans_fields+=4;
+    _ans_fields+=3;

  // ---------------------------  Device allocations
  success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
@ -134,11 +134,11 @@ void AnswerT::clear() {

 template <class numtyp, class acctyp>
 double AnswerT::host_memory_usage() const {
-  int atom_bytes=4;
+  int atom_bytes=3;
  if (_charge)
    atom_bytes+=1;
  if (_rot)
-    atom_bytes+=4;
+    atom_bytes+=3;
  int ans_bytes=atom_bytes+_ev_fields;
  return ans_bytes*(_max_local)*sizeof(acctyp)+
         sizeof(Answer<numtyp,acctyp>);
@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
  if (csize>0)
    engv.update_host(_ev_stride*csize,true);
  if (_rot)
-    force.update_host(_inum*4*2,true);
+    force.update_host(_inum*3*2,true);
  else
-    force.update_host(_inum*4,true);
+    force.update_host(_inum*3,true);
  time_answer.stop();

  #ifndef GERYON_OCL_FLUSH
@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
 template <class numtyp, class acctyp>
 void AnswerT::get_answers(double **f, double **tor) {
  if (_ilist==nullptr) {
-    typedef struct { double x,y,z; } vec3d;
-    typedef struct { acctyp x,y,z,w; } vec4d_t;
-    auto fp=reinterpret_cast<vec3d*>(&(f[0][0]));
-    auto forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
+    auto fp=reinterpret_cast<double*>(&(f[0][0]));

    #if (LAL_USE_OMP == 1)
    #pragma omp parallel
@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) {
      #if (LAL_USE_OMP == 1)
      const int nthreads = omp_get_num_threads();
      const int tid = omp_get_thread_num();
-      const int idelta = _inum / nthreads + 1;
+      const int idelta = _inum*3 / nthreads + 1;
      const int ifrom = tid * idelta;
-      const int ito = std::min(ifrom + idelta, _inum);
+      const int ito = std::min(ifrom + idelta, _inum*3);
      #else
      const int ifrom = 0;
-      const int ito = _inum;
+      const int ito = _inum*3;
      #endif

-      for (int i=ifrom; i<ito; i++) {
-        fp[i].x+=forcep[i].x;
-        fp[i].y+=forcep[i].y;
-        fp[i].z+=forcep[i].z;
-      }
+      for (int i=ifrom; i<ito; i++)
+        fp[i]+=force[i];
      if (_rot) {
-        auto torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
-        auto torquep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
-        for (int i=ifrom; i<ito; i++) {
-          torp[i].x+=torquep[i].x;
-          torp[i].y+=torquep[i].y;
-          torp[i].z+=torquep[i].z;
-        }
+        auto torp=reinterpret_cast<double*>(&(tor[0][0]));
+        auto torquep=&(force[_inum*3]);
+        for (int i=ifrom; i<ito; i++)
+          torp[i]+=torquep[i];
      }
    }
  } else {
@ -344,7 +335,7 @@ void AnswerT::get_answers(double **f, double **tor) {
      const int idelta = _inum / nthreads + 1;
      const int ifrom = tid * idelta;
      const int ito = std::min(ifrom + idelta, _inum);
-      int fl=ifrom*4;
+      int fl=ifrom*3;
      #else
      const int ifrom = 0;
      const int ito = _inum;
@ -356,16 +347,16 @@ void AnswerT::get_answers(double **f, double **tor) {
        f[ii][0]+=force[fl];
        f[ii][1]+=force[fl+1];
        f[ii][2]+=force[fl+2];
-        fl+=4;
+        fl+=3;
      }
      if (_rot) {
-        fl=_inum*4 + ifrom*4;
+        fl=_inum*3 + ifrom*3;
        for (int i=ifrom; i<ito; i++) {
          int ii=_ilist[i];
          tor[ii][0]+=force[fl];
          tor[ii][1]+=force[fl+1];
          tor[ii][2]+=force[fl+2];
-          fl+=4;
+          fl+=3;
        }
      }
    }
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@ -114,7 +114,7 @@ bool AtomT::alloc(const int nall) {
                                UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=q.device.row_bytes();
  }
-  if (_rot && !_host_view) {
+  if (_rot) {
    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                   UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=quat.device.row_bytes();
@ -182,12 +182,10 @@ bool AtomT::add_fields(const bool charge, const bool rot,
  if (rot && !_rot) {
    _rot=true;
    _other=true;
-    if (!_host_view) {
    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                   UCL_READ_ONLY)==UCL_SUCCESS);
    gpu_bytes+=quat.device.row_bytes();
  }
-  }

  if (vel && !_vel) {
    _vel=true;
@ -451,7 +449,7 @@ template <class numtyp, class acctyp>
 void AtomT::compile_kernels(UCL_Device &dev) {
  std::string flags = "";
  atom_program=new UCL_Program(dev);
-  atom_program->load_string(atom,flags,nullptr,screen);
+  atom_program->load_string(atom,flags.c_str(),nullptr,stderr);
  k_cast_x.set_function(*atom_program,"kernel_cast_x");
  _compiled=true;
 }
--- a/lib/gpu/lal_atom.cu
+++ b/lib/gpu/lal_atom.cu
@ -18,7 +18,7 @@
 #endif

 __kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
-                            const __global numtyp *restrict x,
+                            const __global double *restrict x,
                            const __global int *restrict type,
                            const int nall) {
  int ii=GLOBAL_ID_X;
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@ -52,6 +52,12 @@ using namespace ucl_cudadr;

 namespace LAMMPS_AL {

+struct EllipsoidBonus {
+  double shape[3];
+  double quat[4];
+  int ilocal;
+};
+
 template <class numtyp, class acctyp>
 class Atom {
 public:
@ -306,8 +312,8 @@ class Atom {
    if (_x_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
-      memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
-      memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
+      memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
+      memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
      #else
      vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
      vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
@ -351,6 +357,24 @@ class Atom {
    add_x_data(host_ptr,host_type);
  }

+  // Cast mu data to write buffer (stored in quat)
+  template<class cpytyp>
+  inline void cast_mu_data(cpytyp *host_ptr) {
+    if (_quat_avail==false) {
+      double t=MPI_Wtime();
+      if (sizeof(numtyp)==sizeof(double))
+        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
+      else
+        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
+        #pragma omp parallel for simd schedule(static)
+        #elif (LAL_USE_OMP_SIMD == 1)
+        #pragma omp simd
+        #endif
+        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
+      _time_cast+=MPI_Wtime()-t;
+    }
+  }
+
  // Cast charges to write buffer
  template<class cpytyp>
  inline void cast_q_data(cpytyp *host_ptr) {
@ -384,22 +408,24 @@ class Atom {
  }

  // Cast quaternions to write buffer
-  template<class cpytyp>
-  inline void cast_quat_data(cpytyp *host_ptr) {
+  inline void cast_quat_data(const int *ellipsoid,
+                             const EllipsoidBonus *bonus) {
    if (_quat_avail==false) {
      double t=MPI_Wtime();
-      if (_host_view) {
-        quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
-        quat.device.view(quat.host);
-      } else if (sizeof(numtyp)==sizeof(double))
-        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
-      else
      #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
      #pragma omp parallel for simd schedule(static)
      #elif (LAL_USE_OMP_SIMD == 1)
      #pragma omp simd
      #endif
-        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
+      for (int i=0; i<_nall; i++) {
+        int qi = ellipsoid[i];
+        if (qi > -1) {
+          quat[i*4] = bonus[qi].quat[0];
+          quat[i*4+1] = bonus[qi].quat[1];
+          quat[i*4+2] = bonus[qi].quat[2];
+          quat[i*4+3] = bonus[qi].quat[3];
+        }
+      }
      _time_cast+=MPI_Wtime()-t;
    }
  }
@ -419,10 +445,6 @@ class Atom {
  inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
    if (_v_avail==false) {
      double t=MPI_Wtime();
-      #ifdef GPU_CAST
-      memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
-      memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
-      #else
      vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
      vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
      #if (LAL_USE_OMP == 1)
@ -434,7 +456,6 @@ class Atom {
        vp[i].z=host_p[i].z;
        vp[i].w=host_tag[i];
      }
-      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }
@ -444,16 +465,7 @@ class Atom {
  inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
    time_vel.start();
    if (_v_avail==false) {
-      #ifdef GPU_CAST
-      v_cast.update_device(_nall*3,true);
-      tag_cast.update_device(_nall,true);
-      int block_size=64;
-      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
-      k_cast_x.set_size(GX,block_size);
-      k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
-      #else
      v.update_device(_nall*4,true);
-      #endif
      _v_avail=true;
    }
    time_vel.stop();
@ -519,7 +531,7 @@ class Atom {
  UCL_Vector<numtyp4,numtyp4> extra;

  #ifdef GPU_CAST
-  UCL_Vector<numtyp,numtyp> x_cast;
+  UCL_Vector<double,double> x_cast;
  UCL_Vector<int,int> type_cast;
  #endif

--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -143,10 +143,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);

  _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _tep.alloc(_max_tep_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);

  _max_fieldp_size = _max_tep_size;
-  _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _fieldp.alloc(_max_fieldp_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);

  _max_thetai_size = 0;

@ -387,7 +387,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,

  if (inum_full>_max_tep_size) {
    _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
-    _tep.resize(_max_tep_size*4);
+    _tep.resize(_max_tep_size*3);
  }
  *tep_ptr=_tep.host.begin();

@ -403,7 +403,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,

  // copy tep from device to host

-  _tep.update_host(_max_tep_size*4,false);
+  _tep.update_host(_max_tep_size*3,false);
 }

 // ---------------------------------------------------------------------------
@ -429,7 +429,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)

-  _fieldp.update_host(_max_fieldp_size*8,false);
+  _fieldp.update_host(_max_fieldp_size*6,false);
 }

 // ---------------------------------------------------------------------------
@ -456,7 +456,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  //       after umutual1 and self are done on the GPU
  // *fieldp_ptr=_fieldp.host.begin();
-  // _fieldp.update_host(_max_fieldp_size*8,false);
+  // _fieldp.update_host(_max_fieldp_size*6,false);
 }

 // ---------------------------------------------------------------------------
@ -732,7 +732,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
  device->add_ans_object(ans);

  // copy tep from device to host
-  _tep.update_host(_max_tep_size*4,false);
+  _tep.update_host(_max_tep_size*3,false);
 }

 // ---------------------------------------------------------------------------
--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@ -233,7 +233,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,

  atom->cast_x_data(host_x,host_type);
  atom->cast_q_data(host_q);
-  atom->cast_quat_data(host_mu[0]);
+  atom->cast_mu_data(host_mu[0]);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);
  atom->add_q_data();
@ -297,12 +297,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
    if (!success)
      return nullptr;
    atom->cast_q_data(host_q);
-    atom->cast_quat_data(host_mu[0]);
+    atom->cast_mu_data(host_mu[0]);
    hd_balancer.start_timer();
  } else {
    atom->cast_x_data(host_x,host_type);
    atom->cast_q_data(host_q);
-    atom->cast_quat_data(host_mu[0]);
+    atom->cast_mu_data(host_mu[0]);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@ -375,7 +375,8 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
                             const bool eflag_in, const bool vflag_in,
                             const bool eatom, const bool vatom,
                             int &host_start, const double cpu_time,
-                             bool &success, double **host_quat) {
+                             bool &success, const int *ellipsoid,
+                             const EllipsoidBonus *bonus) {
  acc_timers();
  int eflag, vflag;
  if (eflag_in) eflag=2;
@ -409,7 +410,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
    list=ilist;

  atom->cast_x_data(host_x,host_type);
-  atom->cast_quat_data(host_quat[0]);
+  atom->cast_quat_data(ellipsoid,bonus);
  hd_balancer.start_timer();
  atom->add_x_data(host_x,host_type);
  atom->add_quat_data();
@ -433,7 +434,8 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
                              const bool eatom, const bool vatom,
                              int &host_start, int **ilist, int **jnum,
                              const double cpu_time, bool &success,
-                              double **host_quat) {
+                              const int *ellipsoid,
+                              const EllipsoidBonus *bonus) {
  acc_timers();
  int eflag, vflag;
  if (eflag_in) eflag=2;
@ -460,11 +462,11 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
                    sublo, subhi, tag, nspecial, special, success);
    if (!success)
      return nullptr;
-    atom->cast_quat_data(host_quat[0]);
+    atom->cast_quat_data(ellipsoid,bonus);
    hd_balancer.start_timer();
  } else {
    atom->cast_x_data(host_x,host_type);
-    atom->cast_quat_data(host_quat[0]);
+    atom->cast_quat_data(ellipsoid,bonus);
    hd_balancer.start_timer();
    atom->add_x_data(host_x,host_type);
  }
--- a/lib/gpu/lal_base_ellipsoid.h
+++ b/lib/gpu/lal_base_ellipsoid.h
@ -170,7 +170,8 @@ class BaseEllipsoid {
               double **host_x, int *host_type, int *ilist, int *numj,
               int **firstneigh, const bool eflag, const bool vflag,
               const bool eatom, const bool vatom, int &host_start,
-               const double cpu_time, bool &success, double **quat);
+               const double cpu_time, bool &success,
+               const int *ellipsoid, const EllipsoidBonus *bonus);

  /// Pair loop with device neighboring
  int**compute(const int ago, const int inum_full, const int nall,
@ -179,7 +180,7 @@ class BaseEllipsoid {
               tagint **special, const bool eflag, const bool vflag,
               const bool eatom, const bool vatom, int &host_start,
               int **ilist, int **numj, const double cpu_time, bool &success,
-               double **host_quat);
+               const int *ellipsoid, const EllipsoidBonus *bonus);

  // -------------------------- DEVICE DATA -------------------------

--- a/lib/gpu/lal_beck.cu
+++ b/lib/gpu/lal_beck.cu
@ -31,7 +31,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -130,7 +131,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -150,7 +151,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
    beck2[tid]=beck2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -172,6 +173,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@ -32,7 +32,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -48,7 +48,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -123,7 +124,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -144,7 +145,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -166,6 +167,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@ -36,7 +36,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -158,7 +159,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -183,7 +184,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -206,6 +207,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_born_coul_long_cs.cu
+++ b/lib/gpu/lal_born_coul_long_cs.cu
@ -51,7 +51,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -75,7 +75,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -95,6 +95,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -192,7 +193,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -217,7 +218,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -240,6 +241,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@ -38,7 +38,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -63,7 +63,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -89,6 +89,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -174,7 +175,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -200,7 +201,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -229,6 +230,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_born_coul_wolf_cs.cu
+++ b/lib/gpu/lal_born_coul_wolf_cs.cu
@ -39,7 +39,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -64,7 +64,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -90,6 +90,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -176,7 +177,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -202,7 +203,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -231,6 +232,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@ -31,7 +31,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag,  const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -120,7 +121,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -141,7 +142,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -163,6 +164,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@ -36,7 +36,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -151,7 +152,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -177,7 +178,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -200,6 +201,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@ -36,7 +36,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -159,7 +160,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
                                    const __global numtyp *restrict sp_lj_in,
                                    const __global int *dev_nbor,
                                    const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum, const int nbor_pitch,
@ -185,7 +186,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
      coeff2[tid]=coeff2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -208,6 +209,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_charmm.cu
+++ b/lib/gpu/lal_charmm.cu
@ -34,7 +34,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag,
                       const int inum, const int nbor_pitch,
@ -53,7 +53,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_bio();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -73,6 +73,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -159,7 +160,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag,
                            const int inum, const int nbor_pitch,
@ -187,7 +188,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -209,6 +210,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@ -35,7 +35,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch,
@ -50,7 +50,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_bio();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -70,6 +70,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -156,7 +157,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict sp_lj_in,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
                                 const int inum, const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
  if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
    ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -203,6 +204,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_colloid.cu
+++ b/lib/gpu/lal_colloid.cu
@ -34,7 +34,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
                        const __global int *form,
                        const __global int *dev_nbor,
                        const __global int *dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@ -50,7 +50,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -69,6 +69,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -188,7 +189,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
                             const __global int *form_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
@ -215,7 +216,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -237,6 +238,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_coul.cu
+++ b/lib/gpu/lal_coul.cu
@ -35,7 +35,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_cl_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch,
@ -54,7 +54,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -74,6 +74,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,

    numtyp factor_coul;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_coul = sp_cl[sbmask(j)];
@ -125,7 +126,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_cl_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -146,7 +147,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
    cutsq[tid]=_cutsq[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -169,6 +170,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,

    numtyp factor_coul;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_coul = sp_cl[sbmask(j)];
--- a/lib/gpu/lal_coul_debye.cu
+++ b/lib/gpu/lal_coul_debye.cu
@ -35,7 +35,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_cl_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@ -55,7 +55,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -75,6 +75,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,

    numtyp factor_coul;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_coul = sp_cl[sbmask(j)];
@ -129,7 +130,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_cl_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@ -153,7 +154,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
    cutsq[tid]=_cutsq[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -176,6 +177,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,

    numtyp factor_coul;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_coul = sp_cl[sbmask(j)];
--- a/lib/gpu/lal_coul_dsf.cu
+++ b/lib/gpu/lal_coul_dsf.cu
@ -36,7 +36,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch,
@ -56,7 +56,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -81,6 +81,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul, r, prefactor, erfcc;
@ -138,7 +139,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@ -156,7 +157,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -183,6 +184,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul, r, prefactor, erfcc;
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@ -35,7 +35,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_cl_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -54,7 +54,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp e_coul, virial[6];
  if (EVFLAG) {
@ -73,6 +73,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
    numtyp qtmp; fetch(qtmp,i,q_tex);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul;
@ -132,7 +133,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_cl_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -152,7 +153,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
    scale[tid]=scale_in[tid];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp e_coul, virial[6];
  if (EVFLAG) {
@ -174,6 +175,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul;
--- a/lib/gpu/lal_coul_long_cs.cu
+++ b/lib/gpu/lal_coul_long_cs.cu
@ -49,7 +49,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_cl_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -68,7 +68,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
  sp_cl[2]=sp_cl_in[2];
  sp_cl[3]=sp_cl_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp e_coul, virial[6];
  if (EVFLAG) {
@ -87,6 +87,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
    numtyp qtmp; fetch(qtmp,i,q_tex);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul;
@ -166,7 +167,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_cl_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -186,7 +187,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
    scale[tid]=scale_in[tid];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp e_coul, virial[6];
  if (EVFLAG) {
@ -208,6 +209,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_coul;
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -370,7 +370,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)

  _ocl_config_name="CUSTOM";
  int token_count=0;
-  std::string params[18];
+  std::string params[19];
  char ocl_config[2048];
  strncpy(ocl_config,s_config.c_str(),2047);
  char *pch = strtok(ocl_config,",");
@ -378,7 +378,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
  pch = strtok(nullptr,",");
  if (pch == nullptr) return -11;
  while (pch != nullptr) {
-    if (token_count==18)
+    if (token_count==19)
      return -11;
    params[token_count]=pch;
    token_count++;
@ -389,6 +389,16 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
  #ifdef CL_VERSION_2_0
  _ocl_compile_string+="-cl-std=CL2.0 ";
  #endif
+  if (params[0]=="500") {
+    _ocl_compile_string+="-DINTEL_OCL ";
+    #ifdef _DOUBLE_DOUBLE
+    // workaround for double precision with Intel OpenCL
+    params[4]="0";
+    #endif
+  }
+  #ifdef LAL_DISABLE_PREFETCH
+  params[18]="0";
+  #endif
  if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
  _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
    std::string(OCL_PRECISION_COMPILE);
@ -421,7 +431,8 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)

                         " -DMAX_SHARED_TYPES="+params[15]+
                         " -DMAX_BIO_SHARED_TYPES="+params[16]+
-                         " -DPPPM_MAX_SPLINE="+params[17];
+                         " -DPPPM_MAX_SPLINE="+params[17]+
+                         " -DNBOR_PREFETCH="+params[18];
  _ocl_compile_string += extra_args;
  #endif
  return 0;
@ -558,7 +569,11 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
    return -3;

  if (_user_cell_size<0.0) {
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    _neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
+    #else
    _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
+    #endif
  } else
    _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
  nbor->set_cutoff(cutoff);
@ -954,7 +969,7 @@ int DeviceT::compile_kernels() {
  k_info.set_function(*dev_program,"kernel_info");
  _compiled=true;

-  UCL_Vector<int,int> gpu_lib_data(19,*gpu,UCL_NOT_PINNED);
+  UCL_Vector<int,int> gpu_lib_data(20,*gpu,UCL_NOT_PINNED);
  k_info.set_size(1,1);
  k_info.run(&gpu_lib_data);
  gpu_lib_data.update_host(false);
--- a/lib/gpu/lal_device.cu
+++ b/lib/gpu/lal_device.cu
@ -52,4 +52,5 @@ __kernel void kernel_info(__global int *info) {
  info[16]=MAX_SHARED_TYPES;
  info[17]=MAX_BIO_SHARED_TYPES;
  info[18]=PPPM_MAX_SPLINE;
+  info[19]=NBOR_PREFETCH;
 }
--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@ -211,7 +211,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -235,7 +235,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -257,6 +257,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -282,8 +283,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
        numtyp rinv, r3inv, r5inv, r7inv;
        numtyp pre1, pre2, pre3, pre4;
        numtyp pdotp, pidotr, pjdotr;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -418,7 +419,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -445,7 +446,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -470,6 +471,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -494,8 +496,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
        numtyp rinv, r3inv, r5inv, r7inv;
        numtyp pre1, pre2, pre3, pre4;
        numtyp pdotp, pidotr, pjdotr;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
--- a/lib/gpu/lal_dipole_lj_sf.cu
+++ b/lib/gpu/lal_dipole_lj_sf.cu
@ -212,7 +212,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -236,7 +236,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -258,6 +258,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -286,8 +287,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
        numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
        numtyp4 aforcecoul, bforcecoul;

-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -450,7 +451,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -478,7 +479,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -503,6 +504,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -530,8 +532,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
        numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
        numtyp4 aforcecoul, bforcecoul;

-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
--- a/lib/gpu/lal_dipole_long_lj.cu
+++ b/lib/gpu/lal_dipole_long_lj.cu
@ -213,7 +213,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@ -238,7 +238,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -264,6 +264,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -291,8 +292,8 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
        numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
        numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
        numtyp fdx,fdy,fdz,fax,fay,faz;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@ -462,7 +463,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@ -490,7 +491,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
@ -519,6 +520,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -545,8 +547,8 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
        numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
        numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
        numtyp fdx,fdy,fdz,fax,fay,faz;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;

        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
--- a/lib/gpu/lal_dpd.cu
+++ b/lib/gpu/lal_dpd.cu
@ -168,7 +168,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
                    const __global numtyp *restrict sp_sqrt,
                    const __global int * dev_nbor,
                    const __global int * dev_packed,
-                    __global acctyp4 *restrict ans,
+                    __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch,
@ -183,7 +183,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -203,6 +203,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,

    numtyp factor_dpd, factor_sqrt;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_dpd = sp_lj[sbmask(j)];
@ -284,7 +285,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_sqrt_in,
                         const __global int * dev_nbor,
                         const __global int * dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch,
@ -318,7 +319,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -343,6 +344,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
    numtyp factor_dpd, factor_sqrt;
    #endif
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      #ifndef ONETYPE
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@ -246,6 +246,7 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
    tfrho=type2frho[itype];

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      j &= NEIGHMASK;

@ -332,6 +333,7 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
    #endif

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      j &= NEIGHMASK;

@ -376,7 +378,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
                    const __global numtyp *cutsq,
                    const __global int *dev_nbor,
                    const __global int *dev_packed,
-                    __global acctyp4 *ans,
+                    __global acctyp3 *ans,
                    __global acctyp *engv,
                    const int eflag, const int vflag,  const int inum,
                    const int nbor_pitch, const int ntypes,
@ -388,7 +390,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_answers_eam();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -407,6 +409,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      j &= NEIGHMASK;

@ -487,7 +490,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
                         const __global numtyp *cutsq,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *ans,
+                         __global acctyp3 *ans,
                         __global acctyp *engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const numtyp cutforcesq,
@ -510,7 +513,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
  int n_stride;
  local_allocate_store_answers_eam();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -532,6 +535,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
    #endif

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      j &= NEIGHMASK;

--- a/lib/gpu/lal_ellipsoid_extra.h
+++ b/lib/gpu/lal_ellipsoid_extra.h
@ -152,7 +152,7 @@ _texture_2d( quat_tex,int4);
        engv+=inum;                                                         \
      }                                                                     \
    }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -224,7 +224,7 @@ _texture_2d( quat_tex,int4);
        engv+=inum;                                                         \
      }                                                                     \
    }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@ -30,7 +30,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@ -40,7 +40,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -59,6 +59,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -109,7 +110,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@ -127,7 +128,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
    gauss1[tid]=gauss1_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -149,6 +150,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@ -80,6 +80,9 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
                    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }

+#ifdef INTEL_OCL
+__attribute__((intel_reqd_sub_group_size(16)))
+#endif
 __kernel void k_gayberne(const __global numtyp4 *restrict x_,
                         const __global numtyp4 *restrict q,
                         const __global numtyp4 *restrict shape,
@ -90,7 +93,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict lshape,
                         const __global int *dev_nbor,
                         const int stride,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         const int astride,
                         __global acctyp *restrict engv,
                         __global int *restrict err_flag,
@ -108,7 +111,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, virial[6];
@ -138,6 +141,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
--- a/lib/gpu/lal_gayberne_ext.cpp
+++ b/lib/gpu/lal_gayberne_ext.cpp
@ -108,28 +108,33 @@ int** compute(const int ago, const int inum_full, const int nall,
                tagint **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-                double **host_quat);
+                const int *ellipsoid, const EllipsoidBonus *bonus);

 int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, double *sublo,
-                       double *subhi, tagint *tag, int **nspecial, tagint **special,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start, int **ilist,
-                       int **jnum, const double cpu_time, bool &success,
-                       double **host_quat) {
+                       double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum, const double cpu_time,
+                       bool &success, const int *ellipsoid,
+                       const void *bonus) {
  return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
                      tag, nspecial, special, eflag, vflag, eatom, vatom,
-                      host_start, ilist, jnum, cpu_time, success, host_quat);
+                      host_start, ilist, jnum, cpu_time, success,
+                      ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }

 int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
-                     const double cpu_time, bool &success, double **host_quat) {
+                     const double cpu_time, bool &success,
+                     const int *ellipsoid, const void *bonus) {
  return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
                      numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
-                      cpu_time, success, host_quat);
+                      cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }

 // ---------------------------------------------------------------------------
--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@ -34,7 +34,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
                                          const __global numtyp *restrict lshape,
                                          const __global int *dev_nbor,
                                          const int stride,
-                                          __global acctyp4 *restrict ans,
+                                          __global acctyp3 *restrict ans,
                                          __global acctyp *restrict engv,
                                          __global int *restrict err_flag,
                                          const int eflag, const int vflag,
@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -75,6 +75,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);

      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -259,7 +260,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict gum,
                            const int stride,
                            const __global int *dev_ij,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            __global int *restrict err_flag,
                            const int eflag, const int vflag, const int start,
@ -277,7 +278,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[5];
  sp_lj[3]=gum[6];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -296,6 +297,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -347,7 +349,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict gum,
                                 const int stride,
                                 const __global int *dev_ij,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 __global int *restrict err_flag,
                                 const int eflag, const int vflag,
@ -371,7 +373,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -393,6 +395,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@ -113,7 +113,7 @@ _texture( q_tex,int2);
    dufld[5]=red_acc[5][tid];                                               \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -147,7 +147,7 @@ _texture( q_tex,int2);
    _fieldp[5]=red_acc[5][tid];                                             \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -174,7 +174,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -254,7 +254,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@ -277,7 +277,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
    f.x = _fieldp[0];                                                       \
    f.y = _fieldp[1];                                                       \
    f.z = _fieldp[2];                                                       \
@ -302,7 +302,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -391,7 +391,7 @@ _texture( q_tex,int2);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -416,9 +416,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
-                                __global acctyp4 *restrict tep,
+                                __global acctyp3 *restrict tep,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                const int t_per_atom, const numtyp aewald,
@ -432,7 +432,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -441,7 +441,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

-  acctyp4 tq;
+  acctyp3 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;

  const __global numtyp4* polar1 = &extra[0];
@ -717,7 +717,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
@ -730,7 +730,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -895,9 +895,9 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
-                                __global acctyp4 *restrict tep,
+                                __global acctyp3 *restrict tep,
                                const int eflag, const int vflag, const int inum,
                                const int nall, const int nbor_pitch,
                                const int t_per_atom, const numtyp aewald,
@ -910,7 +910,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -919,7 +919,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

-  acctyp4 tq;
+  acctyp3 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;

  const __global numtyp4* polar1 = &extra[0];
@ -1210,7 +1210,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict fieldp,
+                                __global acctyp3 *restrict fieldp,
                                const int inum,  const int nall,
                                const int nbor_pitch, const int t_per_atom,
                                const numtyp aewald, const numtyp off2,
@ -1390,7 +1390,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict fieldp,
+                                __global acctyp3 *restrict fieldp,
                                const int inum,  const int nall,
                                const int nbor_pitch, const int t_per_atom,
                                const numtyp aewald, const numtyp off2,
@ -1541,9 +1541,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
                            const __global int *dev_short_nbor,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
-                            __global acctyp4 *restrict tep,
+                            __global acctyp3 *restrict tep,
                            const int eflag, const int vflag, const int inum,
                            const int nall, const int nbor_pitch, const int t_per_atom,
                            const numtyp aewald, const numtyp felec,
@ -1556,7 +1556,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@ -31,7 +31,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
                   const __global numtyp *restrict sp_lj,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom) {
@ -41,7 +41,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -59,6 +59,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -110,7 +111,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int * dev_nbor,
                        const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@ -144,7 +145,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -166,6 +167,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,

    NOUNROLL
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      #ifndef ONETYPE
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@ -31,7 +31,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -118,7 +119,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -139,7 +140,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -161,6 +162,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@ -36,7 +36,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag,  const int vflag,
                               const int inum, const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -156,7 +157,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
                                    const __global numtyp *restrict sp_lj_in,
                                    const __global int *dev_nbor,
                                    const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum, const int nbor_pitch,
@ -182,7 +183,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -205,6 +206,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@ -36,7 +36,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int *dev_nbor,
                        const __global int *dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -147,7 +148,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -173,7 +174,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -196,6 +197,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@ -36,7 +36,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch,
@ -60,7 +60,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -80,6 +80,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -154,7 +155,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -204,6 +205,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@ -36,7 +36,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -154,7 +155,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -178,7 +179,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -201,6 +202,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_coul_msm.cu
+++ b/lib/gpu/lal_lj_coul_msm.cu
@ -94,7 +94,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -117,7 +117,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -139,6 +139,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
    numtyp cut_coul = ucl_sqrt(cut_coulsq);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -215,7 +216,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict sp_lj_in,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
                                 const int inum,  const int nbor_pitch,
@ -239,7 +240,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -264,6 +265,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
    numtyp cut_coul = ucl_sqrt(cut_coulsq);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_cubic.cu
+++ b/lib/gpu/lal_lj_cubic.cu
@ -39,7 +39,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj,
                         const __global int * dev_nbor,
                         const __global int * dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -132,7 +133,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int * dev_nbor,
                              const __global int * dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch, const int t_per_atom) {
@ -155,7 +156,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -176,6 +177,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@ -38,7 +38,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj_in,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch,
@ -62,7 +62,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -88,6 +88,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
@ -165,7 +166,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch,
@ -190,7 +191,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -219,6 +220,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul, r, prefactor, erfcc;
--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@ -33,7 +33,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -122,7 +123,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom) {
@ -143,7 +144,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -165,6 +166,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_expand_coul_long.cu
+++ b/lib/gpu/lal_lj_expand_coul_long.cu
@ -36,7 +36,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -158,7 +159,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -181,7 +182,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
    lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -204,6 +205,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_gromacs.cu
+++ b/lib/gpu/lal_lj_gromacs.cu
@ -34,7 +34,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@ -50,7 +50,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -134,7 +135,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch, const int t_per_atom) {
@ -156,7 +157,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
    ljsw[tid]=ljsw_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -177,6 +178,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_smooth.cu
+++ b/lib/gpu/lal_lj_smooth.cu
@ -33,7 +33,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
                   const __global numtyp *restrict sp_lj,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom) {
@ -43,7 +43,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -63,6 +63,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
    numtyp r, t, tsq, fskin;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -135,7 +136,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int * dev_nbor,
                        const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@ -169,7 +170,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -194,6 +195,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,

    NOUNROLL
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];
      #ifndef ONETYPE
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_spica.cu
+++ b/lib/gpu/lal_lj_spica.cu
@ -31,7 +31,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj_in,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -128,7 +129,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, const int t_per_atom) {
@ -149,7 +150,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -171,6 +172,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_lj_spica_long.cu
+++ b/lib/gpu/lal_lj_spica_long.cu
@ -36,7 +36,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag,  const int vflag, const int inum,
                            const int nbor_pitch,
@ -59,7 +59,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
  sp_lj[6]=sp_lj_in[6];
  sp_lj[7]=sp_lj_in[7];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -79,6 +79,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
@ -166,7 +167,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict sp_lj_in,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
                                 const int inum, const int nbor_pitch,
@ -189,7 +190,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
    lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
@ -212,6 +213,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj, factor_coul;
--- a/lib/gpu/lal_lj_tip4p_long.cu
+++ b/lib/gpu/lal_lj_tip4p_long.cu
@ -143,7 +143,7 @@ ucl_inline void compute_newsite(int iO, int  iH1, int  iH2,
 ---------------------------------------------------------------------- */
 __kernel void k_lj_tip4p_long_distrib(
    const __global numtyp4 *restrict x_,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
    __global acctyp *restrict engv,
    const int eflag, const int vflag, const int inum,
    const int nbor_pitch, const int t_per_atom,
@ -155,7 +155,7 @@ __kernel void k_lj_tip4p_long_distrib(
    const __global acctyp4 *restrict ansO) {

  int i = BLOCK_ID_X*(BLOCK_SIZE_X)+THREAD_ID_X;
-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;

  if (i<inum) {
@ -208,7 +208,7 @@ __kernel void k_lj_tip4p_long_distrib(
        engv[inum*engv_iter + i] += vM.z * (acctyp)(1 - alpha);
      }
    }
-    acctyp4 old=ans[i];
+    acctyp3 old=ans[i];
    old.x+=f.x;
    old.y+=f.y;
    old.z+=f.z;
@ -325,7 +325,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
    const __global numtyp *restrict sp_lj,
    const __global int * dev_nbor,
    const __global int * dev_packed,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
    __global acctyp *restrict engv,
    const int eflag, const int vflag, const int inum,
    const int nbor_pitch, const int t_per_atom,
@ -344,7 +344,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_charge();

-  acctyp4 f, fO;
+  acctyp3 f;
+  acctyp4 fO;
  f.x=(acctyp)0;  f.y=(acctyp)0;  f.z=(acctyp)0;
  fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
  acctyp energy, e_coul, virial[6], vO[6];
@ -386,6 +387,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj,factor_coul;
@ -470,7 +472,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
            e_coul += prefactor*(_erfc-factor_coul);
          }
          if (EVFLAG && vflag) {
-            acctyp4 fd;
+            acctyp3 fd;
            fd.x = delx*force_coul;
            fd.y = dely*force_coul;
            fd.z = delz*force_coul;
@ -645,7 +647,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
    const __global numtyp *restrict sp_lj_in,
    const __global int * dev_nbor,
    const __global int * dev_packed,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
    __global acctyp *restrict engv,
    const int eflag, const int vflag, const int inum,
    const int nbor_pitch, const int t_per_atom,
@ -674,7 +676,8 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
    if (EVFLAG && eflag)
      lj3[tid]=lj3_in[tid];
  }
-  acctyp4 f, fO;
+  acctyp3 f;
+  acctyp4 fO;
  f.x=(acctyp)0;  f.y=(acctyp)0;  f.z=(acctyp)0;
  fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
  acctyp energy, e_coul, virial[6], vO[6];
@ -717,6 +720,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
    }

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
      int j=dev_packed[nbor];

      numtyp factor_lj,factor_coul;
@ -801,7 +805,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
            e_coul += prefactor*(_erfc-factor_coul);
          }
          if (EVFLAG && vflag) {
-            acctyp4 fd;
+            acctyp3 fd;
            fd.x = delx*force_coul;
            fd.y = dely*force_coul;
            fd.z = delz*force_coul;
--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@ -31,7 +31,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
                    const __global numtyp *restrict sp_lj_in,
                    const __global int *dev_nbor,
                    const __global int *dev_packed,
-                    __global acctyp4 *restrict ans,
+                    __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch, const int t_per_atom) {
@ -47,7 +47,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -66,6 +66,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -119,7 +120,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@ -139,7 +140,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
    mie3[tid]=mie3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -161,6 +162,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@ -33,7 +33,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -120,7 +121,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@ -141,7 +142,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
      mor2[tid]=mor2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -163,6 +164,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -489,6 +489,10 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,

 #endif

+#define SPECIAL_DATA_PRELOAD_SIZE 3
+#define UNROLL_FACTOR_LIST 4
+#define UNROLL_FACTOR_SPECIAL 2
+
 __kernel void kernel_special(__global int *dev_nbor,
                             __global int *host_nbor_list,
                             const __global int *host_numj,
@ -526,23 +530,68 @@ __kernel void kernel_special(__global int *dev_nbor,
      list_end=list+fast_mul(numj,stride);
    }

-    for ( ; list<list_end; list+=stride) {
-      int nbor=*list;
-      tagint jtag=tag[nbor];
-
-      int offset=ii;
-      for (int i=0; i<n3; i++) {
-        if (special[offset]==jtag) {
-          int which = 1;
-          if (i>=n1)
-            which++;
-          if (i>=n2)
-            which++;
-          nbor=nbor ^ (which << SBBITS);
-          *list=nbor;
+#if SPECIAL_DATA_PRELOAD_SIZE > 0
+    tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE];
+    for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) {
+      special_preload[j] = special[ii + i*nt];
    }
-        offset+=nt;
+#endif
+
+    for ( ; list<list_end; list+=UNROLL_FACTOR_LIST * stride) {
+      int nbor[UNROLL_FACTOR_LIST];
+      tagint jtag[UNROLL_FACTOR_LIST];
+      __global int* list_addr[UNROLL_FACTOR_LIST];
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        list_addr[l] = list + l*stride;
+        nbor[l] = *list_addr[l];
+      }
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        jtag[l] = tag[nbor[l]];
+      }
+
+      for (int i=0, j=0; i<n3; i+=UNROLL_FACTOR_SPECIAL, j++) {
+        tagint special_data[UNROLL_FACTOR_SPECIAL];
+        int which[UNROLL_FACTOR_SPECIAL];
+
+        for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
+          which[c] = 1;
+          if (i + c < n3)
+          {
+#if SPECIAL_DATA_PRELOAD_SIZE > 0
+            if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) {
+              special_data[c] = special_preload[j];
+            }
+            else
+#endif
+              special_data[c] = special[ii + (i+c)*nt];
+          }
+        }
+
+        for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
+          if (i+k >= n1) {
+            which[k]++;
+          }
+        }
+        for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
+          if (i+k >= n2) {
+            which[k]++;
+          }
+          which[k] <<= SBBITS;
+        }
+        for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
+          if (i + c < n3) {
+            for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+              if (special_data[c] == jtag[l]) {
+                nbor[l]=nbor[l] ^ which[c];
+              }
+            }
+          }
+        }
+      }
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        *list_addr[l] = nbor[l];
      }
    }
  } // if ii
 }
+
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@ -217,7 +217,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
                     const grdtyp delxinv,  const grdtyp delyinv,
                     const grdtyp delzinv, const int order,
                     const int order2, const grdtyp qqrd2e_scale,
-                     __global acctyp4 *restrict ans) {
+                     __global acctyp3 *restrict ans) {
  __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
  __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
  __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
@ -239,7 +239,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
    fetch(qs,ii,q_tex);
    qs*=qqrd2e_scale;

-    acctyp4 ek;
+    acctyp3 ek;
    ek.x=(acctyp)0.0;
    ek.y=(acctyp)0.0;
    ek.z=(acctyp)0.0;
--- a/lib/gpu/lal_pre_cuda_hip.h
+++ b/lib/gpu/lal_pre_cuda_hip.h
@ -57,6 +57,7 @@
 #define MAX_SHARED_TYPES 11
 #define MAX_BIO_SHARED_TYPES 128
 #define PPPM_MAX_SPLINE 8
+#define NBOR_PREFETCH 0

 // -------------------------------------------------------------------------
 //                              KERNEL MACROS
--- a/lib/gpu/lal_pre_ocl_config.h
+++ b/lib/gpu/lal_pre_ocl_config.h
@ -23,7 +23,7 @@
 //   THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
 //   BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
 //   BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
-//   PPPM_MAX_SPLINE}
+//   PPPM_MAX_SPLINE, NBOR_PREFETCH}
 //
 //*************************************************************************/

@ -39,15 +39,15 @@ const char * ocl_config_names[] =
  };
 const char * ocl_config_strings[] =
  {
-   "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8",
-   "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8",
-   "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8",
+   "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8,0",
+   "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
+   "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
 #ifdef _SINGLE_SINGLE
-   "INTEL_GPU,500,8,16,1,1,4,8,1,64,64,64,64,64,8,128,8,128,8",
-   "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8",
+   "INTEL_GPU,500,8,32,1,1,4,8,2,128,128,128,128,64,8,128,8,128,8,2",
+   "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8,0",
 #else
-   "INTEL_GPU,500,8,16,1,1,2,8,1,64,64,64,64,64,8,128,8,128,8",
-   "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8",
+   "INTEL_GPU,500,8,32,1,1,2,8,2,128,128,128,128,64,8,128,8,128,8,2",
+   "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8,0",
 #endif
-   "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8"
+   "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8,0"
  };
--- a/lib/gpu/lal_precision.h
+++ b/lib/gpu/lal_precision.h
@ -57,6 +57,10 @@ struct _lgpu_float2 {
  float x; float y;
 };

+struct _lgpu_float3 {
+  float x; float y; float z;
+};
+
 struct _lgpu_float4 {
  float x; float y; float z; float w;
 };
@ -65,6 +69,10 @@ struct _lgpu_double2 {
  double x; double y;
 };

+struct _lgpu_double3 {
+  double x; double y; double z;
+};
+
 struct _lgpu_double4 {
  double x; double y; double z; double w;
 };
@ -75,6 +83,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) {
  return out;
 }

+inline std::ostream & operator<<(std::ostream &out, const _lgpu_float3 &v) {
+  out << v.x << " " << v.y << " " << v.z;
+  return out;
+}
+
 inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) {
  out << v.x << " " << v.y << " " << v.z;
  return out;
@ -85,6 +98,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) {
  return out;
 }

+inline std::ostream & operator<<(std::ostream &out, const _lgpu_double3 &v) {
+  out << v.x << " " << v.y << " " << v.z;
+  return out;
+}
+
 inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
  out << v.x << " " << v.y << " " << v.z;
  return out;
@ -97,8 +115,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION float
 #define ACC_PRECISION double
 #define numtyp2 _lgpu_float2
+#define numtyp3 _lgpu_float3
 #define numtyp4 _lgpu_float4
 #define acctyp2 _lgpu_double2
+#define acctyp3 _lgpu_double3
 #define acctyp4 _lgpu_double4
 #endif

@ -107,8 +127,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION double
 #define ACC_PRECISION double
 #define numtyp2 _lgpu_double2
+#define numtyp3 _lgpu_double3
 #define numtyp4 _lgpu_double4
 #define acctyp2 _lgpu_double2
+#define acctyp3 _lgpu_double3
 #define acctyp4 _lgpu_double4
 #endif

@ -117,8 +139,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION float
 #define ACC_PRECISION float
 #define numtyp2 _lgpu_float2
+#define numtyp3 _lgpu_float3
 #define numtyp4 _lgpu_float4
 #define acctyp2 _lgpu_float2
+#define acctyp3 _lgpu_float3
 #define acctyp4 _lgpu_float4
 #endif

--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@ -93,6 +93,13 @@
 //     Definition:   Maximum order for splines in PPPM
 //     Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
 //
+//  NBOR_PREFETCH
+//     Definition:   Control use of prefetch for neighbor indices
+//                   0 = No prefetch
+//                   1 = Prefetch using standard API
+//                   2 = Prefetch using Intel intrinsics
+//     Restrictions: NBOR_PREFETCH forced to 0 when LAL_DISABLE_PREFETCH
+//                   is defined in library build
 //*************************************************************************/

 // -------------------------------------------------------------------------
@ -101,6 +108,7 @@

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_pre_cuda_hip.h"
+#define ucl_prefetch(p)
 #define ucl_pow pow
 #endif

@ -169,7 +177,7 @@
 #define ucl_abs fabs
 #define ucl_erfc erfc

-#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
+#if (FAST_MATH > 0) && !defined(_DOUBLE_DOUBLE)

 #define ucl_exp native_exp
 #define ucl_pow pow
@ -285,6 +293,55 @@
  #define simd_size() SIMD_SIZE
 #endif

+// -------------------------------------------------------------------------
+//                      OPENCL KERNEL MACROS - PREFETCH
+// -------------------------------------------------------------------------
+
+#if (NBOR_PREFETCH == 0)
+#define ucl_prefetch(p)
+#endif
+
+#if (NBOR_PREFETCH == 1)
+inline void ucl_prefetch(const __global int *p) {
+  prefetch(p, 1);
+}
+#endif
+
+#if (NBOR_PREFETCH == 2)
+// Load message caching control
+enum LSC_LDCC {
+  LSC_LDCC_DEFAULT,
+  LSC_LDCC_L1UC_L3UC,   //1 Override to L1 uncached and L3 uncached
+  LSC_LDCC_L1UC_L3C,    //1 Override to L1 uncached and L3 cached
+  LSC_LDCC_L1C_L3UC,    //1 Override to L1 cached and L3 uncached
+  LSC_LDCC_L1C_L3C,     //1 Override to L1 cached and L3 cached
+  LSC_LDCC_L1S_L3UC,    //1 Override to L1 streaming load and L3 uncached
+  LSC_LDCC_L1S_L3C,     //1 Override to L1 streaming load and L3 cached
+  LSC_LDCC_L1IAR_L3C,   //1 Override to L1 invalidate-after-read, and L3 cached
+};
+
+void __builtin_IB_lsc_prefetch_global_uint(const __global uint *base,
+                                           int elemOff,
+                                           enum LSC_LDCC cacheOpt); //D32V1
+
+inline void ucl_prefetch(const __global int *p) {
+  __builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0,
+                                        LSC_LDCC_L1C_L3UC);
+}
+#endif
+
+struct _lgpu_float3 {
+  float x; float y; float z;
+};
+struct _lgpu_double3 {
+  double x; double y; double z;
+};
+#ifdef _SINGLE_SINGLE
+#define acctyp3 struct _lgpu_float3
+#else
+#define acctyp3 struct _lgpu_double3
+#endif
+
 // -------------------------------------------------------------------------
 //                            END OPENCL DEFINITIONS
 // -------------------------------------------------------------------------
@ -301,6 +358,9 @@
 #define numtyp4 double4
 #define acctyp double
 #define acctyp2 double2
+#ifndef acctyp3
+#define acctyp3 double3
+#endif
 #define acctyp4 double4
 #endif

@ -310,6 +370,9 @@
 #define numtyp4 float4
 #define acctyp double
 #define acctyp2 double2
+#ifndef acctyp3
+#define acctyp3 double3
+#endif
 #define acctyp4 double4
 #endif

@ -319,6 +382,9 @@
 #define numtyp4 float4
 #define acctyp float
 #define acctyp2 float2
+#ifndef acctyp3
+#define acctyp3 float3
+#endif
 #define acctyp4 float4
 #endif

--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@ -32,6 +32,9 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
  return ans;
 }

+#ifdef INTEL_OCL
+__attribute__((intel_reqd_sub_group_size(16)))
+#endif
 __kernel void k_resquared(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict q,
                          const __global numtyp4 *restrict shape,
@ -41,7 +44,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
                          const int ntypes,
                          const __global int *dev_nbor,
                          const int stride,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          const int astride,
                          __global acctyp *restrict engv,
                          __global int *restrict err_flag,
@ -62,7 +65,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
  const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0;
  const numtyp cr60=ucl_cbrt((numtyp)60.0);

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, virial[6];
@ -122,6 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
--- a/lib/gpu/lal_re_squared_ext.cpp
+++ b/lib/gpu/lal_re_squared_ext.cpp
@ -105,28 +105,32 @@ void re_gpu_clear() {
                tagint **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-                double **host_quat);
+                const int *ellipsoid, const EllipsoidBonus *bonus);

 int** re_gpu_compute_n(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, double *sublo,
-                       double *subhi, tagint *tag, int **nspecial, tagint **special,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start, int **ilist,
-                       int **jnum, const double cpu_time, bool &success,
-                       double **host_quat) {
+                       double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum, const double cpu_time,
+                       bool &success, const int *ellipsoid,
+                       const void *bonus) {
  return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
                      tag, nspecial, special, eflag, vflag, eatom, vatom,
-                      host_start, ilist, jnum, cpu_time, success, host_quat);
+                      host_start, ilist, jnum, cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }

 int * re_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
-                     const double cpu_time, bool &success, double **host_quat) {
+                     const double cpu_time, bool &success,
+                     const int *ellipsoid, const void *bonus) {
  return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
                      numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
-                      cpu_time, success, host_quat);
+                      cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }

 // ---------------------------------------------------------------------------
@ -135,4 +139,3 @@ int * re_gpu_compute(const int ago, const int inum_full, const int nall,
 double re_gpu_bytes() {
  return REMF.host_memory_usage();
 }
-
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@ -86,7 +86,7 @@
        ap1+=astride;                                                        \
      }                                                                      \
    }                                                                        \
-    acctyp4 old=ans[ii];                                                     \
+    acctyp3 old=ans[ii];                                                     \
    old.x+=f.x;                                                              \
    old.y+=f.y;                                                              \
    old.z+=f.z;                                                              \
@ -131,7 +131,7 @@
        ap1+=astride;                                                       \
      }                                                                     \
    }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -154,7 +154,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
                                           const int ntypes,
                                           const __global int *dev_nbor,
                                           const int stride,
-                                           __global acctyp4 *restrict ans,
+                                           __global acctyp3 *restrict ans,
                                           const int astride,
                                           __global acctyp *restrict engv,
                                           __global int *restrict err_flag,
@ -180,7 +180,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
  const numtyp solv_f_r =
     (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);

-  acctyp4 f, tor;
+  acctyp3 f, tor;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
  acctyp energy, virial[6];
@ -216,6 +216,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int j=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
@ -409,7 +410,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
                                           const int ntypes,
                                           const __global int *dev_nbor,
                                           const int stride,
-                                           __global acctyp4 *restrict ans,
+                                           __global acctyp3 *restrict ans,
                                           __global acctyp *restrict engv,
                                           __global int *restrict err_flag,
                                           const int eflag, const int vflag,
@ -435,7 +436,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
  const numtyp solv_f_r =
    (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -454,6 +455,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
      int i=dev_nbor[nbor];
      factor_lj = sp_lj[sbmask(i)];
      i &= NEIGHMASK;
@ -610,7 +612,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict gum,
                             const int stride,
                             const __global int *dev_ij,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             __global int *restrict err_flag,
                             const int eflag, const int vflag, const int start,
@ -628,7 +630,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
  sp_lj[2]=gum[2];
  sp_lj[3]=gum[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -647,6 +649,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -697,7 +700,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict gum,
                                  const int stride,
                                  const __global int *dev_ij,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  __global int *restrict err_flag,
                                  const int eflag, const int vflag,
@ -721,7 +724,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
      lj3[tid]=lj3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -743,6 +746,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);

      int j=dev_ij[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_soft.cu
+++ b/lib/gpu/lal_soft.cu
@ -32,7 +32,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@ -48,7 +48,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -67,6 +67,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -119,7 +120,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@ -137,7 +138,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
    coeff[tid]=coeff_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -159,6 +160,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@ -57,7 +57,7 @@ _texture( sw3_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -116,7 +116,7 @@ _texture( sw3_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -194,7 +194,7 @@ _texture( sw3_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -265,7 +265,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
                   const __global numtyp4 * restrict c_14,
                   const __global numtyp2 * restrict c_56,
                   const int ntypes, const __global int * dev_nbor,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom,
@ -282,7 +282,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
  if (EVFLAG && eflag) pre_sw_c56=c_56[ONETYPE];
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -461,7 +461,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                const __global numtyp2 *restrict sw_pre3,
                                const int ntypes,
                                const __global int * dev_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag,
                                const int inum,  const int nbor_pitch,
@ -480,7 +480,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
  const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -579,7 +579,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                             const __global numtyp2 *restrict sw_pre3,
                             const int ntypes, const __global int * dev_nbor,
                             const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
@ -598,7 +598,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
  const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -701,7 +701,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                             const __global numtyp2 *restrict sw_pre3,
                             const int ntypes, const __global int * dev_nbor,
                             const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
@ -720,7 +720,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
  const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@ -49,7 +49,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom,
@ -66,7 +66,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -87,6 +87,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -146,7 +147,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom,
@ -165,7 +166,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
    cutsq[tid]=cutsq_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -189,6 +190,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -251,7 +253,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom,
@ -268,7 +270,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -289,6 +291,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -352,7 +355,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -371,7 +374,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
    cutsq[tid]=cutsq_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -395,6 +398,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -461,7 +465,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom,
@ -478,7 +482,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -499,6 +503,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -569,7 +574,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
                                  const __global numtyp* sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *ans,
+                                  __global acctyp3 *ans,
                                  __global acctyp *engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -588,7 +593,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
    cutsq[tid]=cutsq_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -611,6 +616,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -686,7 +692,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
                             const __global numtyp* sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                             __global acctyp4 *ans,
+                             __global acctyp3 *ans,
                             __global acctyp *engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom,
@ -703,7 +709,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -724,6 +730,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -792,7 +799,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
                                  const __global numtyp* sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                  __global acctyp4 *ans,
+                                  __global acctyp3 *ans,
                                  __global acctyp *engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -811,7 +818,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
    cutsq[tid]=cutsq_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -835,6 +842,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -132,7 +132,7 @@ _texture_2d( pos_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -211,7 +211,7 @@ _texture_2d( pos_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -448,7 +448,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
                                  const __global int *restrict elem2param,
                                  const int nelements, const int nparams,
                                  const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -472,7 +472,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
  const numtyp ijparam_bigd = ts2_in[ONETYPE3].w;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -553,7 +553,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
                                     const __global acctyp2 *restrict zetaij,
                                     const __global acctyp *restrict zetaij_e,
                                     const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum,  const int nbor_pitch,
@ -585,7 +585,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
  const numtyp gamma = ts4_in[ONETYPE3].w;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -728,7 +728,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp *restrict zetaij_e,
                                  const __global int * dev_nbor,
                                  const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -760,7 +760,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
  const numtyp gamma = ts4_in[ONETYPE3].w;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -950,7 +950,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
                                    const __global acctyp *restrict zetaij_e,
                                    const __global int * dev_nbor,
                                    const __global int * dev_ilist,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum,  const int nbor_pitch,
@ -982,7 +982,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
  const numtyp gamma = ts4_in[ONETYPE3].w;
  #endif

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -131,7 +131,7 @@ _texture_2d( pos_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -209,7 +209,7 @@ _texture_2d( pos_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -417,7 +417,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
                                  const __global int *restrict elem2param,
                                  const int nelements, const int nparams,
                                  const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -434,7 +434,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
    ts2[tid]=ts2_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -511,7 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
                                     const int nelements, const int nparams,
                                     const __global acctyp4 *restrict zetaij,
                                     const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum,  const int nbor_pitch,
@ -535,7 +535,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
    ts5[tid]=ts5_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -676,7 +676,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -700,7 +700,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
    ts5[tid]=ts5_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -890,7 +890,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_ilist,
-                                        __global acctyp4 *restrict ans,
+                                        __global acctyp3 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
@ -914,7 +914,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
    ts5[tid]=ts5_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@ -81,7 +81,7 @@ _texture( ts6_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -149,7 +149,7 @@ _texture( ts6_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -227,7 +227,7 @@ _texture( ts6_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -443,7 +443,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
                                  const __global int *restrict elem2param,
                                  const int nelements, const int nparams,
                                  const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@ -462,7 +462,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
    ts6[tid]=ts6_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -544,7 +544,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
                                     const int nelements, const int nparams,
                                     const __global acctyp4 *restrict zetaij,
                                     const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum,  const int nbor_pitch,
@ -566,7 +566,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
    ts4[tid]=ts4_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -703,7 +703,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
                                  const __global acctyp4 *restrict zetaij,
                                  const __global int * dev_nbor,
                                  const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@ -725,7 +725,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
    ts4[tid]=ts4_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -908,7 +908,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
                                        const __global acctyp4 *restrict zetaij,
                                        const __global int * dev_nbor,
                                        const __global int * dev_ilist,
-                                        __global acctyp4 *restrict ans,
+                                        __global acctyp3 *restrict ans,
                                        __global acctyp *restrict engv,
                                        const int eflag, const int vflag,
                                        const int inum,  const int nbor_pitch,
@ -930,7 +930,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
    ts4[tid]=ts4_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_ufm.cu
+++ b/lib/gpu/lal_ufm.cu
@ -33,7 +33,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
                   const __global numtyp *restrict sp_lj,
                   const __global int * dev_nbor,
                   const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom) {
@ -43,7 +43,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -61,6 +61,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -109,7 +110,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int * dev_nbor,
                        const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@ -130,7 +131,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
      uf3[tid]=uf3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -151,6 +152,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@ -73,7 +73,7 @@ _texture( param5_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -132,7 +132,7 @@ _texture( param5_tex,int4);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -210,7 +210,7 @@ _texture( param5_tex,int4);
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
@ -247,6 +247,7 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
    const int out_stride=nbor_pitch*t_per_atom-t_per_atom;

    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      ucl_prefetch(dev_packed+nbor+nbor_pitch);
      int sj=dev_packed[nbor];
      int j = sj & NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -283,7 +284,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
                   const __global int *restrict elem2param,
                   const int nelements,
                   const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int ev_stride) {
@ -291,7 +292,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,

  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -313,6 +314,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
    itype=map[itype];

    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      ucl_prefetch(dev_packed+nbor+nbor_pitch);

      int j=dev_packed[nbor];
      j &= NEIGHMASK;
@ -489,7 +491,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
                                const __global int *restrict elem2param,
                                const int nelements,
                                const __global int * dev_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag,
                                const int inum,  const int nbor_pitch,
@ -504,7 +506,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,

  local_allocate_store_three();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -612,7 +614,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
@ -627,7 +629,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,

  local_allocate_store_three();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -743,7 +745,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
                             const int nelements,
                             const __global int * dev_nbor,
                             const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum,  const int nbor_pitch,
@ -758,7 +760,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,

  local_allocate_store_three();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@ -30,7 +30,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj_in,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch, const int t_per_atom) {
@ -46,7 +46,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -65,6 +65,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -118,7 +119,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, const int t_per_atom) {
@ -136,7 +137,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
    coeff[tid]=coeff_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -158,6 +159,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@ -40,7 +40,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom,
@ -57,7 +57,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -77,6 +77,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
@ -131,7 +132,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
                                    const __global numtyp *restrict sp_lj_in,
                                    const __global int *dev_nbor,
                                    const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
                                    const int eflag, const int vflag,
                                    const int inum, const int nbor_pitch,
@ -150,7 +151,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
    coeff[tid]=coeff_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -173,6 +174,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
--- a/lib/gpu/lal_zbl.cu
+++ b/lib/gpu/lal_zbl.cu
@ -88,7 +88,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
                    const int lj_types,
                    const __global int *dev_nbor,
                    const __global int *dev_packed,
-                    __global acctyp4 *restrict ans,
+                    __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch, const int t_per_atom) {
@ -98,7 +98,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
  int n_stride;
  local_allocate_store_pair();

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -116,6 +116,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
    int itype=ix.w;

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      j &= NEIGHMASK;
@ -179,7 +180,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
                         const numtyp cut_inner,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@ -198,7 +199,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
    coeff3[tid]=coeff3_in[tid];
  }

-  acctyp4 f;
+  acctyp3 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, virial[6];
  if (EVFLAG) {
@ -219,6 +220,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);

      int j=dev_packed[nbor];
      j &= NEIGHMASK;
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@ -290,6 +290,20 @@ void FixGPU::init()

 void FixGPU::setup(int vflag)
 {
+  // See if we should overlap topology list builds on CPU with work on GPU
+  int overlap_topo = 0;
+  if ((atom->molecular != Atom::ATOMIC)) {
+    PairHybrid *ph = reinterpret_cast<PairHybrid *>(force->pair_match("^hybrid",0));
+    if (ph) {
+      for (int isub=0; isub < ph->nstyles; ++isub) {
+        if (force->pair_match("gpu",0,isub)) overlap_topo = 1;
+      }
+    } else {
+      if (force->pair_match("gpu",0)) overlap_topo = 1;
+    }
+  }
+  if (overlap_topo) neighbor->set_overlap_topo(1);
+
  if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
    if (neighbor->exclude_setting() != 0)
      error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds");
--- a/src/GPU/fix_nve_asphere_gpu.cpp
+++ b/src/GPU/fix_nve_asphere_gpu.cpp
@ -243,13 +243,8 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
    // update angular momentum by 1/2 step
    if (igroup == 0) {
      #if (LAL_USE_OMP_SIMD == 1)
-        // Workaround for compiler bug
-        #ifdef __INTEL_COMPILER
-        #pragma simd
-        #else
      #pragma omp simd
      #endif
-      #endif
      for (int i = ifrom; i < ito; i++) {
        double *quat = bonus[ellipsoid[i]].quat;
        ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
@ -257,13 +252,8 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
      }
    } else {
      #if (LAL_USE_OMP_SIMD == 1)
-        // Workaround for compiler bug
-        #ifdef __INTEL_COMPILER
-        #pragma simd
-        #else
      #pragma omp simd
      #endif
-      #endif
      for (int i = ifrom; i < ito; i++) {
        if (mask[i] & groupbit) {
          double *quat = bonus[ellipsoid[i]].quat;
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -155,6 +155,15 @@ PairAmoebaGPU::~PairAmoebaGPU()
  amoeba_gpu_clear();
 }

+/* ---------------------------------------------------------------------- */
+
+void PairAmoebaGPU::compute(int eflag, int vflag)
+{
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
+  PairAmoeba::compute(eflag, vflag);
+}
+
 /* ----------------------------------------------------------------------
   init specific to this pair style
 ------------------------------------------------------------------------- */
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@ -28,6 +28,7 @@ class PairAmoebaGPU : public PairAmoeba {
 public:
  PairAmoebaGPU(LAMMPS *lmp);
  ~PairAmoebaGPU() override;
+  void compute(int, int) override;
  void init_style() override;
  double memory_usage() override;

--- a/src/GPU/pair_beck_gpu.cpp
+++ b/src/GPU/pair_beck_gpu.cpp
@ -109,6 +109,8 @@ void PairBeckGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_born_coul_long_cs_gpu.cpp
+++ b/src/GPU/pair_born_coul_long_cs_gpu.cpp
@ -129,6 +129,8 @@ void PairBornCoulLongCSGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_born_coul_long_gpu.cpp
+++ b/src/GPU/pair_born_coul_long_gpu.cpp
@ -123,6 +123,8 @@ void PairBornCoulLongGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp
+++ b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp
@ -117,6 +117,8 @@ void PairBornCoulWolfCSGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_born_coul_wolf_gpu.cpp
+++ b/src/GPU/pair_born_coul_wolf_gpu.cpp
@ -114,6 +114,8 @@ void PairBornCoulWolfGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_born_gpu.cpp
+++ b/src/GPU/pair_born_gpu.cpp
@ -109,6 +109,8 @@ void PairBornGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_buck_coul_cut_gpu.cpp
+++ b/src/GPU/pair_buck_coul_cut_gpu.cpp
@ -111,6 +111,8 @@ void PairBuckCoulCutGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_buck_coul_long_gpu.cpp
+++ b/src/GPU/pair_buck_coul_long_gpu.cpp
@ -120,6 +120,8 @@ void PairBuckCoulLongGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_buck_gpu.cpp
+++ b/src/GPU/pair_buck_gpu.cpp
@ -107,6 +107,8 @@ void PairBuckGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_colloid_gpu.cpp
+++ b/src/GPU/pair_colloid_gpu.cpp
@ -109,6 +109,8 @@ void PairColloidGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_coul_cut_gpu.cpp
+++ b/src/GPU/pair_coul_cut_gpu.cpp
@ -108,6 +108,8 @@ void PairCoulCutGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_coul_debye_gpu.cpp
+++ b/src/GPU/pair_coul_debye_gpu.cpp
@ -109,6 +109,8 @@ void PairCoulDebyeGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_coul_dsf_gpu.cpp
+++ b/src/GPU/pair_coul_dsf_gpu.cpp
@ -118,6 +118,8 @@ void PairCoulDSFGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/src/GPU/pair_coul_long_cs_gpu.cpp
+++ b/src/GPU/pair_coul_long_cs_gpu.cpp
@ -123,6 +123,8 @@ void PairCoulLongCSGPU::compute(int eflag, int vflag)
  }
  if (!success) error->one(FLERR, "Insufficient memory on accelerator");

+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
  if (host_start < inum) {
    cpu_time = platform::walltime();
    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
--- a/Show More
+++ b/Show More