diff --git a/doc/src/package.rst b/doc/src/package.rst
index 0ced387539..76bf20a97f 100644
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@@ -319,7 +319,7 @@ CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
 THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
 BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
 BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
-PPPM_MAX_SPLINE.
+PPPM_MAX_SPLINE, NBOR_PREFETCH.
 
 CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
 (NVIDIA) or OpenCL extensions (Intel) should be used for horizontal
diff --git a/lib/gpu/Makefile.oneapi b/lib/gpu/Makefile.oneapi
index 9d11a0c4b0..32800676aa 100644
--- a/lib/gpu/Makefile.oneapi
+++ b/lib/gpu/Makefile.oneapi
@@ -12,13 +12,12 @@ EXTRAMAKE = Makefile.lammps.opencl
 LMP_INC = -DLAMMPS_SMALLBIG
 
 OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
-CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -fp-model fast=2 -no-prec-div \
-          -qoverride-limits
-OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \
+CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
+OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
           $(LMP_INC) $(OCL_INC) $(CPP_OPT)
 OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
 OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
+OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DGERYON_NO_OCL_MARKERS
 
 BIN_DIR = ./
 OBJ_DIR = ./
diff --git a/lib/gpu/Makefile.oneapi_prof b/lib/gpu/Makefile.oneapi_prof
new file mode 100644
index 0000000000..1e21597373
--- /dev/null
+++ b/lib/gpu/Makefile.oneapi_prof
@@ -0,0 +1,28 @@
+# /* ----------------------------------------------------------------------
+#  Linux Makefile for Intel oneAPI - Mixed precision (with timing enabled)
+# ------------------------------------------------------------------------- */
+
+# which file will be copied to Makefile.lammps
+
+EXTRAMAKE = Makefile.lammps.opencl
+
+# this setting should match LAMMPS Makefile
+# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/
+CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math
+OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \
+          $(LMP_INC) $(OCL_INC) $(CPP_OPT)
+OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL
+OCL_PREC = -D_SINGLE_DOUBLE
+OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT
+
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+
+include Opencl.makefile
diff --git a/lib/gpu/README b/lib/gpu/README
index 51b21960ae..b720aa65cb 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -266,6 +266,7 @@ LAL_SERIALIZE_INIT      Force serialization of initialization and compilation
                         for multiple MPI tasks sharing the same accelerator.
                         Some accelerator API implementations have had issues
                         with temporary file conflicts in the past.
+LAL_DISABLE_PREFETCH    Disable prefetch in kernels
 GERYON_FORCE_SHARED_MAIN_MEM_ON      Should only be used for builds where the
                                      accelerator is guaranteed to share physical
                                      main memory with the host (e.g. integrated
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index 1b2e5b8c77..e63a1f56b2 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -429,7 +429,7 @@ void UCL_Device::clear() {
     CU_SAFE_CALL_NS(cuCtxSetCurrent(_old_context));
     CU_SAFE_CALL_NS(cuDevicePrimaryCtxRelease(_cu_device));
 #else
-    cuCtxDestroy(_context));
+    cuCtxDestroy(_context);
 #endif
   }
   _device=-1;
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index f572d3ebd0..82a42cff6c 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -113,7 +113,7 @@ _texture( q_tex,int2);
     dufld[5]=red_acc[5][tid];                                               \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
     t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
       (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
     t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@@ -147,7 +147,7 @@ _texture( q_tex,int2);
     _fieldp[5]=red_acc[5][tid];                                             \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
     f.x = _fieldp[0];                                                       \
     f.y = _fieldp[1];                                                       \
     f.z = _fieldp[2];                                                       \
@@ -174,7 +174,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -254,7 +254,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
     t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
       (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
     t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@@ -277,7 +277,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
     f.x = _fieldp[0];                                                       \
     f.y = _fieldp[1];                                                       \
     f.z = _fieldp[2];                                                       \
@@ -302,7 +302,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -391,7 +391,7 @@ _texture( q_tex,int2);
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -416,9 +416,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
                                  const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
-                                 __global acctyp4 *restrict tep,
+                                 __global acctyp3 *restrict tep,
                                  const int eflag, const int vflag, const int inum,
                                  const int nall, const int nbor_pitch,
                                  const int t_per_atom, const numtyp aewald,
@@ -431,7 +431,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_charge();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -440,9 +440,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
     for (int l=0; l<6; l++) virial[l]=(acctyp)0;
   }
 
-  acctyp4 tq;
+  acctyp3 tq;
   tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
-  
+
   const __global numtyp4* polar1 = &extra[0];
   const __global numtyp4* polar2 = &extra[nall];
   const __global numtyp4* polar3 = &extra[2*nall];
@@ -695,7 +695,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
                                  const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict fieldp,
+                                 __global acctyp3 *restrict fieldp,
                                  const int inum,  const int nall,
                                  const int nbor_pitch, const int t_per_atom,
                                  const numtyp aewald, const numtyp off2,
@@ -889,7 +889,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
                                  const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict fieldp,
+                                 __global acctyp3 *restrict fieldp,
                                  const int inum,  const int nall,
                                  const int nbor_pitch, const int t_per_atom,
                                  const numtyp aewald, const numtyp off2,
@@ -1052,9 +1052,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
                              const __global int *dev_short_nbor,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
-                             __global acctyp4 *restrict tep,
+                             __global acctyp3 *restrict tep,
                              const int eflag, const int vflag, const int inum,
                              const int nall, const int nbor_pitch, const int t_per_atom,
                              const numtyp aewald, const numtyp felec,
@@ -1067,7 +1067,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_charge();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -1082,7 +1082,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
   for (int l=0; l<6; l++) dufld[l]=(acctyp)0;
 
   numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
-  
+
   const __global numtyp4* polar1 = &extra[0];
   const __global numtyp4* polar2 = &extra[nall];
   const __global numtyp4* polar3 = &extra[2*nall];
@@ -1226,7 +1226,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
       numtyp prc3[3],prc5[3],prc7[3];
       numtyp drc3[3],drc5[3],drc7[3];
       numtyp urc3[3],urc5[3];
-    
+
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[5];
@@ -1583,12 +1583,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
   if (ii<inum) {
 
     const int nlpts = (bsorder-1) / 2;
-    
+
     int istart = fast_mul(ii,4);
     const int igridx = igrid[istart];
     const int igridy = igrid[istart+1];
     const int igridz = igrid[istart+2];
-    
+
     // now istart is used to index thetai1, thetai2 and thetai3
     istart = fast_mul(ii,bsorder);
 
@@ -1782,7 +1782,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
     fdip_buf[7] = tuv110_1;
     fdip_buf[8] = tuv101_1;
     fdip_buf[9] = tuv011_1;
-    idx = ii;    
+    idx = ii;
     for (int m = 0; m < 10; m++) {
       fdip_phi1[idx] = fdip_buf[m];
       idx += inum;
@@ -1798,7 +1798,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
     fdip_buf[7] = tuv110_2;
     fdip_buf[8] = tuv101_2;
     fdip_buf[9] = tuv011_2;
-    idx = ii;    
+    idx = ii;
     for (int m = 0; m < 10; m++) {
       fdip_phi2[idx] = fdip_buf[m];
       idx += inum;
@@ -1824,7 +1824,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
     fdip_buf[17] = tuv102;
     fdip_buf[18] = tuv012;
     fdip_buf[19] = tuv111;
-    idx = ii;    
+    idx = ii;
     for (int m = 0; m < 20; m++) {
       fdip_sum_phi[idx] = fdip_buf[m];
       idx += inum;
@@ -1855,12 +1855,12 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
   if (ii<inum) {
 
     int nlpts = (bsorder-1) / 2;
-    
+
     int istart = fast_mul(ii,4);
     int igridx = igrid[istart];
     int igridy = igrid[istart+1];
     int igridz = igrid[istart+2];
-    
+
     // now istart is used to index thetai1, thetai2 and thetai3
     istart = fast_mul(ii,bsorder);
 
@@ -1990,7 +1990,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1,
     buf[18] = tuv012;
     buf[19] = tuv111;
 
-    int idx = ii;    
+    int idx = ii;
     for (int m = 0; m < 20; m++) {
       fphi[idx] = felec * buf[m];
       idx += inum;
diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp
index 361c340ec7..1911be8431 100644
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@@ -28,9 +28,9 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
 
 template <class numtyp, class acctyp>
 int AnswerT::bytes_per_atom() const {
-  int bytes=11*sizeof(acctyp);
+  int bytes=10*sizeof(acctyp);
   if (_rot)
-    bytes+=4*sizeof(acctyp);
+    bytes+=3*sizeof(acctyp);
   if (_charge)
     bytes+=sizeof(acctyp);
   return bytes;
@@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) {
 
   bool success=true;
 
-  _ans_fields=4;
+  _ans_fields=3;
   if (_rot)
-    _ans_fields+=4;
+    _ans_fields+=3;
 
   // ---------------------------  Device allocations
   success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
@@ -134,11 +134,11 @@ void AnswerT::clear() {
 
 template <class numtyp, class acctyp>
 double AnswerT::host_memory_usage() const {
-  int atom_bytes=4;
+  int atom_bytes=3;
   if (_charge)
     atom_bytes+=1;
   if (_rot)
-    atom_bytes+=4;
+    atom_bytes+=3;
   int ans_bytes=atom_bytes+_ev_fields;
   return ans_bytes*(_max_local)*sizeof(acctyp)+
          sizeof(Answer<numtyp,acctyp>);
@@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
   if (csize>0)
     engv.update_host(_ev_stride*csize,true);
   if (_rot)
-    force.update_host(_inum*4*2,true);
+    force.update_host(_inum*3*2,true);
   else
-    force.update_host(_inum*4,true);
+    force.update_host(_inum*3,true);
   time_answer.stop();
 
   #ifndef GERYON_OCL_FLUSH
@@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
 template <class numtyp, class acctyp>
 void AnswerT::get_answers(double **f, double **tor) {
   if (_ilist==nullptr) {
-    typedef struct { double x,y,z; } vec3d;
-    typedef struct { acctyp x,y,z,w; } vec4d_t;
-    auto fp=reinterpret_cast<vec3d*>(&(f[0][0]));
-    auto forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
+    auto fp=reinterpret_cast<double*>(&(f[0][0]));
 
     #if (LAL_USE_OMP == 1)
     #pragma omp parallel
@@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) {
       #if (LAL_USE_OMP == 1)
       const int nthreads = omp_get_num_threads();
       const int tid = omp_get_thread_num();
-      const int idelta = _inum / nthreads + 1;
+      const int idelta = _inum*3 / nthreads + 1;
       const int ifrom = tid * idelta;
-      const int ito = std::min(ifrom + idelta, _inum);
+      const int ito = std::min(ifrom + idelta, _inum*3);
       #else
       const int ifrom = 0;
-      const int ito = _inum;
+      const int ito = _inum*3;
       #endif
 
-      for (int i=ifrom; i<ito; i++) {
-        fp[i].x+=forcep[i].x;
-        fp[i].y+=forcep[i].y;
-        fp[i].z+=forcep[i].z;
-      }
+      for (int i=ifrom; i<ito; i++)
+        fp[i]+=force[i];
       if (_rot) {
-        auto torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
-        auto torquep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
-        for (int i=ifrom; i<ito; i++) {
-          torp[i].x+=torquep[i].x;
-          torp[i].y+=torquep[i].y;
-          torp[i].z+=torquep[i].z;
-        }
+        auto torp=reinterpret_cast<double*>(&(tor[0][0]));
+        auto torquep=&(force[_inum*3]);
+        for (int i=ifrom; i<ito; i++)
+          torp[i]+=torquep[i];
       }
     }
   } else {
@@ -344,7 +335,7 @@ void AnswerT::get_answers(double **f, double **tor) {
       const int idelta = _inum / nthreads + 1;
       const int ifrom = tid * idelta;
       const int ito = std::min(ifrom + idelta, _inum);
-      int fl=ifrom*4;
+      int fl=ifrom*3;
       #else
       const int ifrom = 0;
       const int ito = _inum;
@@ -356,16 +347,16 @@ void AnswerT::get_answers(double **f, double **tor) {
         f[ii][0]+=force[fl];
         f[ii][1]+=force[fl+1];
         f[ii][2]+=force[fl+2];
-        fl+=4;
+        fl+=3;
       }
       if (_rot) {
-        fl=_inum*4 + ifrom*4;
+        fl=_inum*3 + ifrom*3;
         for (int i=ifrom; i<ito; i++) {
           int ii=_ilist[i];
           tor[ii][0]+=force[fl];
           tor[ii][1]+=force[fl+1];
           tor[ii][2]+=force[fl+2];
-          fl+=4;
+          fl+=3;
         }
       }
     }
diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp
index 3d1a1cc963..aa490052cc 100644
--- a/lib/gpu/lal_atom.cpp
+++ b/lib/gpu/lal_atom.cpp
@@ -114,7 +114,7 @@ bool AtomT::alloc(const int nall) {
                                 UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=q.device.row_bytes();
   }
-  if (_rot && !_host_view) {
+  if (_rot) {
     success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
                                    UCL_READ_ONLY)==UCL_SUCCESS);
     gpu_bytes+=quat.device.row_bytes();
@@ -182,11 +182,9 @@ bool AtomT::add_fields(const bool charge, const bool rot,
   if (rot && !_rot) {
     _rot=true;
     _other=true;
-    if (!_host_view) {
-      success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
-                                     UCL_READ_ONLY)==UCL_SUCCESS);
-      gpu_bytes+=quat.device.row_bytes();
-    }
+    success=success && (quat.alloc(_max_atoms*4,*dev,UCL_WRITE_ONLY,
+                                   UCL_READ_ONLY)==UCL_SUCCESS);
+    gpu_bytes+=quat.device.row_bytes();
   }
 
   if (vel && !_vel) {
@@ -451,7 +449,7 @@ template <class numtyp, class acctyp>
 void AtomT::compile_kernels(UCL_Device &dev) {
   std::string flags = "";
   atom_program=new UCL_Program(dev);
-  atom_program->load_string(atom,flags,nullptr,screen);
+  atom_program->load_string(atom,flags.c_str(),nullptr,stderr);
   k_cast_x.set_function(*atom_program,"kernel_cast_x");
   _compiled=true;
 }
diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu
index 287d72803c..1418459301 100644
--- a/lib/gpu/lal_atom.cu
+++ b/lib/gpu/lal_atom.cu
@@ -18,7 +18,7 @@
 #endif
 
 __kernel void kernel_cast_x(__global numtyp4 *restrict x_type,
-                            const __global numtyp *restrict x,
+                            const __global double *restrict x,
                             const __global int *restrict type,
                             const int nall) {
   int ii=GLOBAL_ID_X;
diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h
index 771c2a3571..081a1ae048 100644
--- a/lib/gpu/lal_atom.h
+++ b/lib/gpu/lal_atom.h
@@ -52,6 +52,12 @@ using namespace ucl_cudadr;
 
 namespace LAMMPS_AL {
 
+struct EllipsoidBonus {  // Per-ellipsoid record consumed by Atom::cast_quat_data (only quat is read there)
+  double shape[3];  // not referenced by the visible library code; presumably the ellipsoid shape - TODO confirm
+  double quat[4];   // four values copied in order into the quat write buffer by cast_quat_data
+  int ilocal;       // not referenced by the visible library code; presumably the owning local atom index - TODO confirm
+};  // NOTE(review): presumably must match the layout of the host code's bonus record passed in by pointer - confirm
+
 template <class numtyp, class acctyp>
 class Atom {
  public:
@@ -306,8 +312,8 @@ class Atom {
     if (_x_avail==false) {
       double t=MPI_Wtime();
       #ifdef GPU_CAST
-      memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
-      memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
+      memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
+      memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
       #else
       vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
       vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
@@ -351,6 +357,24 @@ class Atom {
     add_x_data(host_ptr,host_type);
   }
 
+  // Cast mu data (_nall*4 values read from host_ptr) into the quat host write buffer; time spent is added to _time_cast
+  template<class cpytyp>
+  inline void cast_mu_data(cpytyp *host_ptr) {
+    if (_quat_avail==false) {  // nothing is done when _quat_avail is already set
+      double t=MPI_Wtime();  // start of the timed cast
+      if (sizeof(numtyp)==sizeof(double))  // same width as double: raw byte copy, no conversion
+        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));  // NOTE(review): byte count uses numtyp; assumes *host_ptr has that size
+      else  // different width: element-wise assignment converts each value into quat
+        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
+        #pragma omp parallel for simd schedule(static)
+        #elif (LAL_USE_OMP_SIMD == 1)
+        #pragma omp simd
+        #endif
+        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];  // the pragma selected above (if any) applies to this loop
+      _time_cast+=MPI_Wtime()-t;  // accumulate elapsed wall time for the cast
+    }
+  }
+
   // Cast charges to write buffer
   template<class cpytyp>
   inline void cast_q_data(cpytyp *host_ptr) {
@@ -384,22 +408,24 @@ class Atom {
   }
 
   // Cast quaternions to write buffer
-  template<class cpytyp>
-  inline void cast_quat_data(cpytyp *host_ptr) {
+  inline void cast_quat_data(const int *ellipsoid,  // ellipsoid[i]: index into bonus for entry i; negative means no record
+                             const EllipsoidBonus *bonus) {  // bonus[qi].quat supplies the 4 values stored per entry
     if (_quat_avail==false) {
       double t=MPI_Wtime();
-      if (_host_view) {
-        quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
-        quat.device.view(quat.host);
-      } else if (sizeof(numtyp)==sizeof(double))
-        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
-      else
-        #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
-        #pragma omp parallel for simd schedule(static)
-        #elif (LAL_USE_OMP_SIMD == 1)
-        #pragma omp simd
-        #endif
-        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
+      #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
+      #pragma omp parallel for simd schedule(static)
+      #elif (LAL_USE_OMP_SIMD == 1)
+      #pragma omp simd
+      #endif
+      for (int i=0; i<_nall; i++) {  // one entry per i; each iteration writes only quat[i*4 .. i*4+3]
+        int qi = ellipsoid[i];  // index of this entry's record in bonus
+        if (qi > -1) {  // entries with a negative index are skipped, so their quat slots keep their previous contents
+          quat[i*4] = bonus[qi].quat[0];  // the four values are stored contiguously, in order
+          quat[i*4+1] = bonus[qi].quat[1];
+          quat[i*4+2] = bonus[qi].quat[2];
+          quat[i*4+3] = bonus[qi].quat[3];
+        }
+      }
       _time_cast+=MPI_Wtime()-t;
     }
   }
@@ -419,10 +445,6 @@ class Atom {
   inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
     if (_v_avail==false) {
       double t=MPI_Wtime();
-      #ifdef GPU_CAST
-      memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
-      memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
-      #else
       vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
       vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
       #if (LAL_USE_OMP == 1)
@@ -434,7 +456,6 @@ class Atom {
         vp[i].z=host_p[i].z;
         vp[i].w=host_tag[i];
       }
-      #endif
       _time_cast+=MPI_Wtime()-t;
     }
   }
@@ -444,16 +465,7 @@ class Atom {
   inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) {
     time_vel.start();
     if (_v_avail==false) {
-      #ifdef GPU_CAST
-      v_cast.update_device(_nall*3,true);
-      tag_cast.update_device(_nall,true);
-      int block_size=64;
-      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
-      k_cast_x.set_size(GX,block_size);
-      k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
-      #else
       v.update_device(_nall*4,true);
-      #endif
       _v_avail=true;
     }
     time_vel.stop();
@@ -519,7 +531,7 @@ class Atom {
   UCL_Vector<numtyp4,numtyp4> extra;
 
   #ifdef GPU_CAST
-  UCL_Vector<numtyp,numtyp> x_cast;
+  UCL_Vector<double,double> x_cast;
   UCL_Vector<int,int> type_cast;
   #endif
 
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 09d7386461..0821a33b06 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -143,10 +143,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
   dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
 
   _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _tep.alloc(_max_tep_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
 
   _max_fieldp_size = _max_tep_size;
-  _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _fieldp.alloc(_max_fieldp_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
 
   _max_thetai_size = 0;
 
@@ -387,7 +387,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
 
   if (inum_full>_max_tep_size) {
     _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
-    _tep.resize(_max_tep_size*4);
+    _tep.resize(_max_tep_size*3);
   }
   *tep_ptr=_tep.host.begin();
 
@@ -403,7 +403,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
 
   // copy tep from device to host
 
-  _tep.update_host(_max_tep_size*4,false);
+  _tep.update_host(_max_tep_size*3,false);
 }
 
 // ---------------------------------------------------------------------------
@@ -429,7 +429,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double
 
   // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
 
-  _fieldp.update_host(_max_fieldp_size*8,false);
+  _fieldp.update_host(_max_fieldp_size*6,false);
 }
 
 // ---------------------------------------------------------------------------
@@ -456,7 +456,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
   // NOTE: move this step to update_fieldp() to delay device-host transfer
   //       after umutual1 and self are done on the GPU
   // *fieldp_ptr=_fieldp.host.begin();
-  // _fieldp.update_host(_max_fieldp_size*8,false);
+  // _fieldp.update_host(_max_fieldp_size*6,false);
 }
 
 // ---------------------------------------------------------------------------
@@ -732,7 +732,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
   device->add_ans_object(ans);
 
   // copy tep from device to host
-  _tep.update_host(_max_tep_size*4,false);
+  _tep.update_host(_max_tep_size*3,false);
 }
 
 // ---------------------------------------------------------------------------
diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp
index 6ef1c40ca7..7f09e100f1 100644
--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@@ -233,7 +233,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
 
   atom->cast_x_data(host_x,host_type);
   atom->cast_q_data(host_q);
-  atom->cast_quat_data(host_mu[0]);
+  atom->cast_mu_data(host_mu[0]);
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
   atom->add_q_data();
@@ -297,12 +297,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
     if (!success)
       return nullptr;
     atom->cast_q_data(host_q);
-    atom->cast_quat_data(host_mu[0]);
+    atom->cast_mu_data(host_mu[0]);
     hd_balancer.start_timer();
   } else {
     atom->cast_x_data(host_x,host_type);
     atom->cast_q_data(host_q);
-    atom->cast_quat_data(host_mu[0]);
+    atom->cast_mu_data(host_mu[0]);
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp
index 0bc20615a1..bc383de18f 100644
--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@@ -375,7 +375,8 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
                              const bool eflag_in, const bool vflag_in,
                              const bool eatom, const bool vatom,
                              int &host_start, const double cpu_time,
-                             bool &success, double **host_quat) {
+                             bool &success, const int *ellipsoid,
+                             const EllipsoidBonus *bonus) {
   acc_timers();
   int eflag, vflag;
   if (eflag_in) eflag=2;
@@ -409,7 +410,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
     list=ilist;
 
   atom->cast_x_data(host_x,host_type);
-  atom->cast_quat_data(host_quat[0]);
+  atom->cast_quat_data(ellipsoid,bonus);
   hd_balancer.start_timer();
   atom->add_x_data(host_x,host_type);
   atom->add_quat_data();
@@ -433,7 +434,8 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
                               const bool eatom, const bool vatom,
                               int &host_start, int **ilist, int **jnum,
                               const double cpu_time, bool &success,
-                              double **host_quat) {
+                              const int *ellipsoid,
+                              const EllipsoidBonus *bonus) {
   acc_timers();
   int eflag, vflag;
   if (eflag_in) eflag=2;
@@ -460,11 +462,11 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full,
                     sublo, subhi, tag, nspecial, special, success);
     if (!success)
       return nullptr;
-    atom->cast_quat_data(host_quat[0]);
+    atom->cast_quat_data(ellipsoid,bonus);
     hd_balancer.start_timer();
   } else {
     atom->cast_x_data(host_x,host_type);
-    atom->cast_quat_data(host_quat[0]);
+    atom->cast_quat_data(ellipsoid,bonus);
     hd_balancer.start_timer();
     atom->add_x_data(host_x,host_type);
   }
diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h
index 9885e931ee..618f97da54 100644
--- a/lib/gpu/lal_base_ellipsoid.h
+++ b/lib/gpu/lal_base_ellipsoid.h
@@ -170,7 +170,8 @@ class BaseEllipsoid {
                double **host_x, int *host_type, int *ilist, int *numj,
                int **firstneigh, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
-               const double cpu_time, bool &success, double **quat);
+               const double cpu_time, bool &success,
+               const int *ellipsoid, const EllipsoidBonus *bonus);
 
   /// Pair loop with device neighboring
   int**compute(const int ago, const int inum_full, const int nall,
@@ -179,7 +180,7 @@ class BaseEllipsoid {
                tagint **special, const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-               double **host_quat);
+               const int *ellipsoid, const EllipsoidBonus *bonus);
 
   // -------------------------- DEVICE DATA -------------------------
 
diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu
index 12f1314c52..b0a9a6a4c1 100644
--- a/lib/gpu/lal_beck.cu
+++ b/lib/gpu/lal_beck.cu
@@ -31,7 +31,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@@ -47,7 +47,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -66,6 +66,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -130,7 +131,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@@ -150,7 +151,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
     beck2[tid]=beck2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -172,6 +173,7 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu
index 825175af8f..8d7e26217f 100644
--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@@ -32,7 +32,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@@ -48,7 +48,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -67,6 +67,7 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -123,7 +124,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@@ -144,7 +145,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
       coeff2[tid]=coeff2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -166,6 +167,7 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu
index d38a101c30..8bb0e69182 100644
--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@@ -36,7 +36,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -60,7 +60,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -80,6 +80,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -158,7 +159,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -183,7 +184,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
       coeff2[tid]=coeff2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -206,6 +207,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_born_coul_long_cs.cu b/lib/gpu/lal_born_coul_long_cs.cu
index 077ec2f74f..0ed395fa0e 100644
--- a/lib/gpu/lal_born_coul_long_cs.cu
+++ b/lib/gpu/lal_born_coul_long_cs.cu
@@ -51,7 +51,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -75,7 +75,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -95,6 +95,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -192,7 +193,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -217,7 +218,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
       coeff2[tid]=coeff2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -240,6 +241,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu
index aefcac8127..f1ce04e784 100644
--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@@ -38,7 +38,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -63,7 +63,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -89,6 +89,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -174,7 +175,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -200,7 +201,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
       coeff2[tid]=coeff2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -229,6 +230,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_born_coul_wolf_cs.cu b/lib/gpu/lal_born_coul_wolf_cs.cu
index 866d256f33..785bbed49b 100644
--- a/lib/gpu/lal_born_coul_wolf_cs.cu
+++ b/lib/gpu/lal_born_coul_wolf_cs.cu
@@ -39,7 +39,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -64,7 +64,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -90,6 +90,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -176,7 +177,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -202,7 +203,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
       coeff2[tid]=coeff2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -231,6 +232,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu
index 958c7bdd4d..7554a27869 100644
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@@ -31,7 +31,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag,  const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@@ -47,7 +47,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -66,6 +66,7 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -120,7 +121,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@@ -141,7 +142,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
       coeff2[tid]=coeff2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -163,6 +164,7 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu
index 2aaa9c9b3d..e7b6dd5e98 100644
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@@ -36,7 +36,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -59,7 +59,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -79,6 +79,7 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -151,7 +152,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -177,7 +178,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
       coeff2[tid]=coeff2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -200,6 +201,7 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu
index f5ce3a7d11..6f50d3ea2a 100644
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@@ -36,7 +36,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -60,7 +60,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -80,6 +80,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -159,7 +160,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
                                     const __global numtyp *restrict sp_lj_in,
                                     const __global int *dev_nbor,
                                     const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum, const int nbor_pitch,
@@ -185,7 +186,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
       coeff2[tid]=coeff2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -208,6 +209,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_charmm.cu b/lib/gpu/lal_charmm.cu
index 589d9adc91..f05ef10e1c 100644
--- a/lib/gpu/lal_charmm.cu
+++ b/lib/gpu/lal_charmm.cu
@@ -34,7 +34,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj,
                        const __global int *dev_nbor,
                        const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag,
                        const int inum, const int nbor_pitch,
@@ -53,7 +53,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_bio();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -73,6 +73,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -159,7 +160,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag,
                             const int inum, const int nbor_pitch,
@@ -187,7 +188,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
   if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
     ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -209,6 +210,7 @@ __kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu
index 77793d0e83..f34654cfa0 100644
--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@@ -35,7 +35,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@@ -50,7 +50,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_bio();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -70,6 +70,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -156,7 +157,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@@ -181,7 +182,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
   if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
     ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -203,6 +204,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu
index f59215e882..49c190cfc2 100644
--- a/lib/gpu/lal_colloid.cu
+++ b/lib/gpu/lal_colloid.cu
@@ -34,7 +34,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
                         const __global int *form,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@@ -50,7 +50,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -69,6 +69,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -188,7 +189,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
                              const __global int *form_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch, const int t_per_atom) {
@@ -215,7 +216,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -237,6 +238,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu
index c4da81a3a2..60be4216cf 100644
--- a/lib/gpu/lal_coul.cu
+++ b/lib/gpu/lal_coul.cu
@@ -35,7 +35,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_cl_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch,
@@ -54,7 +54,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
   sp_cl[2]=sp_cl_in[2];
   sp_cl[3]=sp_cl_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -74,6 +74,7 @@ __kernel void k_coul(const __global numtyp4 *restrict x_,
 
     numtyp factor_coul;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_coul = sp_cl[sbmask(j)];
@@ -125,7 +126,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_cl_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -146,7 +147,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
     cutsq[tid]=_cutsq[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -169,6 +170,7 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_coul;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_coul = sp_cl[sbmask(j)];
diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu
index ba922f04a6..ba89322910 100644
--- a/lib/gpu/lal_coul_debye.cu
+++ b/lib/gpu/lal_coul_debye.cu
@@ -35,7 +35,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_cl_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch,
@@ -55,7 +55,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
   sp_cl[2]=sp_cl_in[2];
   sp_cl[3]=sp_cl_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -75,6 +75,7 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_,
 
     numtyp factor_coul;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_coul = sp_cl[sbmask(j)];
@@ -129,7 +130,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict sp_cl_in,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag, const int inum,
                                 const int nbor_pitch,
@@ -153,7 +154,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
     cutsq[tid]=_cutsq[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -176,6 +177,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_coul;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_coul = sp_cl[sbmask(j)];
diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu
index 5241cb5097..17b28ef9bf 100644
--- a/lib/gpu/lal_coul_dsf.cu
+++ b/lib/gpu/lal_coul_dsf.cu
@@ -36,7 +36,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@@ -56,7 +56,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -81,6 +81,7 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_coul, r, prefactor, erfcc;
@@ -138,7 +139,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@@ -156,7 +157,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
   if (tid<4)
     sp_lj[tid]=sp_lj_in[tid];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -183,6 +184,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_coul, r, prefactor, erfcc;
diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu
index f8a33e90a2..ca0ed0eab9 100644
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@@ -35,7 +35,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_cl_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -54,7 +54,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
   sp_cl[2]=sp_cl_in[2];
   sp_cl[3]=sp_cl_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp e_coul, virial[6];
   if (EVFLAG) {
@@ -73,6 +73,7 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_,
     numtyp qtmp; fetch(qtmp,i,q_tex);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_coul;
@@ -132,7 +133,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_cl_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -152,7 +153,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
     scale[tid]=scale_in[tid];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp e_coul, virial[6];
   if (EVFLAG) {
@@ -174,6 +175,7 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_coul;
diff --git a/lib/gpu/lal_coul_long_cs.cu b/lib/gpu/lal_coul_long_cs.cu
index dfbc771adc..5a1e59e407 100644
--- a/lib/gpu/lal_coul_long_cs.cu
+++ b/lib/gpu/lal_coul_long_cs.cu
@@ -49,7 +49,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_cl_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -68,7 +68,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
   sp_cl[2]=sp_cl_in[2];
   sp_cl[3]=sp_cl_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp e_coul, virial[6];
   if (EVFLAG) {
@@ -87,6 +87,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
     numtyp qtmp; fetch(qtmp,i,q_tex);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_coul;
@@ -166,7 +167,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_cl_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -186,7 +187,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
     scale[tid]=scale_in[tid];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp e_coul, virial[6];
   if (EVFLAG) {
@@ -208,6 +209,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_coul;
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 891d67913e..af53572590 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -370,7 +370,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
 
   _ocl_config_name="CUSTOM";
   int token_count=0;
-  std::string params[18];
+  std::string params[19];
   char ocl_config[2048];
   strncpy(ocl_config,s_config.c_str(),2047);
   char *pch = strtok(ocl_config,",");
@@ -378,7 +378,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
   pch = strtok(nullptr,",");
   if (pch == nullptr) return -11;
   while (pch != nullptr) {
-    if (token_count==18)
+    if (token_count==19)
       return -11;
     params[token_count]=pch;
     token_count++;
@@ -389,6 +389,16 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
   #ifdef CL_VERSION_2_0
   _ocl_compile_string+="-cl-std=CL2.0 ";
   #endif
+  if (params[0]=="500") {
+    _ocl_compile_string+="-DINTEL_OCL ";
+    #ifdef _DOUBLE_DOUBLE
+    // Intel OpenCL double-precision workaround: zero params[4] so that -cl-fast-relaxed-math is not appended below
+    params[4]="0";
+    #endif
+  }
+  #ifdef LAL_DISABLE_PREFETCH
+  params[18]="0";  // build-time override: kernels are then compiled with -DNBOR_PREFETCH=0
+  #endif
   if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
   _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
     std::string(OCL_PRECISION_COMPILE);
@@ -421,7 +431,8 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
 
                          " -DMAX_SHARED_TYPES="+params[15]+
                          " -DMAX_BIO_SHARED_TYPES="+params[16]+
-                         " -DPPPM_MAX_SPLINE="+params[17];
+                         " -DPPPM_MAX_SPLINE="+params[17]+
+                         " -DNBOR_PREFETCH="+params[18];
   _ocl_compile_string += extra_args;
   #endif
   return 0;
@@ -558,7 +569,11 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
     return -3;
 
   if (_user_cell_size<0.0) {
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    _neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
+    #else
     _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
+    #endif
   } else
     _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
   nbor->set_cutoff(cutoff);
@@ -954,7 +969,7 @@ int DeviceT::compile_kernels() {
   k_info.set_function(*dev_program,"kernel_info");
   _compiled=true;
 
-  UCL_Vector<int,int> gpu_lib_data(19,*gpu,UCL_NOT_PINNED);
+  UCL_Vector<int,int> gpu_lib_data(20,*gpu,UCL_NOT_PINNED);
   k_info.set_size(1,1);
   k_info.run(&gpu_lib_data);
   gpu_lib_data.update_host(false);
diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu
index 61341964b2..073c7de3d9 100644
--- a/lib/gpu/lal_device.cu
+++ b/lib/gpu/lal_device.cu
@@ -52,4 +52,5 @@ __kernel void kernel_info(__global int *info) {
   info[16]=MAX_SHARED_TYPES;
   info[17]=MAX_BIO_SHARED_TYPES;
   info[18]=PPPM_MAX_SPLINE;
+  info[19]=NBOR_PREFETCH;
 }
diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu
index cbe68ff692..18326edd3a 100644
--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@@ -211,7 +211,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -235,7 +235,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
@@ -257,6 +257,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -282,8 +283,8 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
         numtyp rinv, r3inv, r5inv, r7inv;
         numtyp pre1, pre2, pre3, pre4;
         numtyp pdotp, pidotr, pjdotr;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;
 
         forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
         ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@@ -418,7 +419,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -445,7 +446,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
@@ -470,6 +471,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -494,8 +496,8 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_,
         numtyp rinv, r3inv, r5inv, r7inv;
         numtyp pre1, pre2, pre3, pre4;
         numtyp pdotp, pidotr, pjdotr;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;
 
         forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
         ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu
index 717d8959ba..13c874d516 100644
--- a/lib/gpu/lal_dipole_lj_sf.cu
+++ b/lib/gpu/lal_dipole_lj_sf.cu
@@ -212,7 +212,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@@ -236,7 +236,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
@@ -258,6 +258,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -286,8 +287,8 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_,
         numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
         numtyp4 aforcecoul, bforcecoul;
 
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;
 
         forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
         ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@@ -450,7 +451,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict sp_lj_in,
                                   const __global int *dev_nbor,
                                   const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
@@ -478,7 +479,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
@@ -503,6 +504,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -530,8 +532,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_,
         numtyp presf,afac,bfac,pqfac,qpfac,rcutlj2inv,rcutlj6inv,rcutcoul2inv;
         numtyp4 aforcecoul, bforcecoul;
 
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;
 
         forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
         ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
diff --git a/lib/gpu/lal_dipole_long_lj.cu b/lib/gpu/lal_dipole_long_lj.cu
index 407b63f93e..361dcd6d87 100644
--- a/lib/gpu/lal_dipole_long_lj.cu
+++ b/lib/gpu/lal_dipole_long_lj.cu
@@ -213,7 +213,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch,
@@ -238,7 +238,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
@@ -264,6 +264,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -291,8 +292,8 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
         numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
         numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
         numtyp fdx,fdy,fdz,fax,fay,faz;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;
 
         forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
         ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
@@ -462,7 +463,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch,
@@ -490,7 +491,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
@@ -519,6 +520,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -545,8 +547,8 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
         numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
         numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
         numtyp fdx,fdy,fdz,fax,fay,faz;
-        acctyp4 forcecoul, ticoul;
-        acctyp4 force;
+        acctyp3 forcecoul, ticoul;
+        acctyp3 force;
 
         forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
         ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu
index 0c861f51de..e9d3279378 100644
--- a/lib/gpu/lal_dpd.cu
+++ b/lib/gpu/lal_dpd.cu
@@ -168,7 +168,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_sqrt,
                     const __global int * dev_nbor,
                     const __global int * dev_packed,
-                    __global acctyp4 *restrict ans,
+                    __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch,
@@ -183,7 +183,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -203,6 +203,7 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_,
 
     numtyp factor_dpd, factor_sqrt;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_dpd = sp_lj[sbmask(j)];
@@ -284,7 +285,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_sqrt_in,
                          const __global int * dev_nbor,
                          const __global int * dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@@ -318,7 +319,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -343,6 +344,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_,
     numtyp factor_dpd, factor_sqrt;
     #endif
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       #ifndef ONETYPE
diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu
index dc518fb550..8c7a410379 100644
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@@ -246,6 +246,7 @@ __kernel void k_energy(const __global numtyp4 *restrict x_,
     tfrho=type2frho[itype];
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
@@ -332,6 +333,7 @@ __kernel void k_energy_fast(const __global numtyp4 *restrict x_,
     #endif
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
@@ -376,7 +378,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
                     const __global numtyp *cutsq,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                    __global acctyp4 *ans,
+                    __global acctyp3 *ans,
                     __global acctyp *engv,
                     const int eflag, const int vflag,  const int inum,
                     const int nbor_pitch, const int ntypes,
@@ -388,7 +390,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_answers_eam();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -407,6 +409,7 @@ __kernel void k_eam(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
@@ -487,7 +490,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
                          const __global numtyp *cutsq,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                         __global acctyp4 *ans,
+                         __global acctyp3 *ans,
                          __global acctyp *engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const numtyp cutforcesq,
@@ -510,7 +513,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
   int n_stride;
   local_allocate_store_answers_eam();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -532,6 +535,7 @@ __kernel void k_eam_fast(const __global numtyp4 *x_,
     #endif
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
 
diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h
index 1c549ab6a6..4a5ac04dea 100644
--- a/lib/gpu/lal_ellipsoid_extra.h
+++ b/lib/gpu/lal_ellipsoid_extra.h
@@ -152,7 +152,7 @@ _texture_2d( quat_tex,int4);
         engv+=inum;                                                         \
       }                                                                     \
     }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -224,7 +224,7 @@ _texture_2d( quat_tex,int4);
         engv+=inum;                                                         \
       }                                                                     \
     }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu
index cb6b72db30..652f068fdc 100644
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@@ -30,7 +30,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch, const int t_per_atom) {
@@ -40,7 +40,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -59,6 +59,7 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -109,7 +110,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, const int t_per_atom) {
@@ -127,7 +128,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
     gauss1[tid]=gauss1_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -149,6 +150,7 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu
index 9267dfd85d..50c1ddf9f9 100644
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@@ -80,6 +80,9 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
                     m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }
 
+#ifdef INTEL_OCL
+__attribute__((intel_reqd_sub_group_size(16)))
+#endif
 __kernel void k_gayberne(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict q,
                          const __global numtyp4 *restrict shape,
@@ -90,7 +93,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict lshape,
                          const __global int *dev_nbor,
                          const int stride,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                          const int astride,
                          __global acctyp *restrict engv,
                          __global int *restrict err_flag,
@@ -108,7 +111,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
   sp_lj[2]=gum[5];
   sp_lj[3]=gum[6];
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, virial[6];
@@ -138,6 +141,7 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
       int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp
index 864da8e7ad..e25d945dca 100644
--- a/lib/gpu/lal_gayberne_ext.cpp
+++ b/lib/gpu/lal_gayberne_ext.cpp
@@ -108,28 +108,33 @@ int** compute(const int ago, const int inum_full, const int nall,
                 tagint **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
                 int **ilist, int **numj, const double cpu_time, bool &success,
-                double **host_quat);
+                const int *ellipsoid, const EllipsoidBonus *bonus);
 
 int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
                        double **host_x, int *host_type, double *sublo,
-                       double *subhi, tagint *tag, int **nspecial, tagint **special,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start, int **ilist,
-                       int **jnum, const double cpu_time, bool &success,
-                       double **host_quat) {
+                       double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum, const double cpu_time,
+                       bool &success, const int *ellipsoid,
+                       const void *bonus) {
   return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
                       tag, nspecial, special, eflag, vflag, eatom, vatom,
-                      host_start, ilist, jnum, cpu_time, success, host_quat);
+                      host_start, ilist, jnum, cpu_time, success,
+                      ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }
 
 int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
-                     const double cpu_time, bool &success, double **host_quat) {
+                     const double cpu_time, bool &success,
+                     const int *ellipsoid, const void *bonus) {
   return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
                       numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
-                      cpu_time, success, host_quat);
+                      cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }
 
 // ---------------------------------------------------------------------------
diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu
index 4582f0d411..55b4eddb58 100644
--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@@ -34,7 +34,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
                                           const __global numtyp *restrict lshape,
                                           const __global int *dev_nbor,
                                           const int stride,
-                                          __global acctyp4 *restrict ans,
+                                          __global acctyp3 *restrict ans,
                                           __global acctyp *restrict engv,
                                           __global int *restrict err_flag,
                                           const int eflag, const int vflag,
@@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
   sp_lj[2]=gum[5];
   sp_lj[3]=gum[6];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -75,6 +75,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
 
       int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -259,7 +260,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict gum,
                             const int stride,
                             const __global int *dev_ij,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             __global int *restrict err_flag,
                             const int eflag, const int vflag, const int start,
@@ -277,7 +278,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
   sp_lj[2]=gum[5];
   sp_lj[3]=gum[6];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -296,6 +297,7 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);
 
       int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -347,7 +349,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict gum,
                                  const int stride,
                                  const __global int *dev_ij,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  __global int *restrict err_flag,
                                  const int eflag, const int vflag,
@@ -371,7 +373,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -393,6 +395,7 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);
 
       int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu
index 99e20db223..0a0f4f02be 100644
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@@ -113,7 +113,7 @@ _texture( q_tex,int2);
     dufld[5]=red_acc[5][tid];                                               \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
     t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
       (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
     t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@@ -147,7 +147,7 @@ _texture( q_tex,int2);
     _fieldp[5]=red_acc[5][tid];                                             \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
     f.x = _fieldp[0];                                                       \
     f.y = _fieldp[1];                                                       \
     f.z = _fieldp[2];                                                       \
@@ -174,7 +174,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -254,7 +254,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 t;                                                              \
+    acctyp3 t;                                                              \
     t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
       (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
     t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
@@ -277,7 +277,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 f, fp;                                                          \
+    acctyp3 f, fp;                                                          \
     f.x = _fieldp[0];                                                       \
     f.y = _fieldp[1];                                                       \
     f.z = _fieldp[2];                                                       \
@@ -302,7 +302,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -391,7 +391,7 @@ _texture( q_tex,int2);
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -416,9 +416,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
-                                __global acctyp4 *restrict tep,
+                                __global acctyp3 *restrict tep,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
                                 const int t_per_atom, const numtyp aewald,
@@ -432,7 +432,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_charge();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -441,7 +441,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
     for (int l=0; l<6; l++) virial[l]=(acctyp)0;
   }
 
-  acctyp4 tq;
+  acctyp3 tq;
   tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
 
   const __global numtyp4* polar1 = &extra[0];
@@ -634,7 +634,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
       frcx = sizik * frcx;
       frcy = sizik * frcy;
       frcz = sizik * frcz;
-      
+
       // compute the torque components for this interaction
 
       numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) -
@@ -717,7 +717,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
                                  const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag, const int inum,
                                  const int nall, const int nbor_pitch,
@@ -730,7 +730,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_charge();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -895,9 +895,9 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
-                                __global acctyp4 *restrict tep,
+                                __global acctyp3 *restrict tep,
                                 const int eflag, const int vflag, const int inum,
                                 const int nall, const int nbor_pitch,
                                 const int t_per_atom, const numtyp aewald,
@@ -910,7 +910,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_charge();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -919,7 +919,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
     for (int l=0; l<6; l++) virial[l]=(acctyp)0;
   }
 
-  acctyp4 tq;
+  acctyp3 tq;
   tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
 
   const __global numtyp4* polar1 = &extra[0];
@@ -1210,7 +1210,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict fieldp,
+                                __global acctyp3 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
@@ -1390,7 +1390,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict fieldp,
+                                __global acctyp3 *restrict fieldp,
                                 const int inum,  const int nall,
                                 const int nbor_pitch, const int t_per_atom,
                                 const numtyp aewald, const numtyp off2,
@@ -1541,9 +1541,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
                             const __global int *dev_short_nbor,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
-                            __global acctyp4 *restrict tep,
+                            __global acctyp3 *restrict tep,
                             const int eflag, const int vflag, const int inum,
                             const int nall, const int nbor_pitch, const int t_per_atom,
                             const numtyp aewald, const numtyp felec,
@@ -1556,7 +1556,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_charge();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -1697,7 +1697,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
       numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
 
       // calculate the real space Ewald error function terms
-      
+
       int m;
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
@@ -2003,12 +2003,12 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
   if (ii<inum) {
 
     const int nlpts = (bsorder-1) / 2;
-    
+
     int istart = fast_mul(ii,4);
     const int igridx = igrid[istart];
     const int igridy = igrid[istart+1];
     const int igridz = igrid[istart+2];
-    
+
     // now istart is used to index thetai1, thetai2 and thetai3
     istart = fast_mul(ii,bsorder);
 
@@ -2202,7 +2202,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
     fdip_buf[7] = tuv110_1;
     fdip_buf[8] = tuv101_1;
     fdip_buf[9] = tuv011_1;
-    idx = ii;    
+    idx = ii;
     for (int m = 0; m < 10; m++) {
       fdip_phi1[idx] = fdip_buf[m];
       idx += inum;
@@ -2218,7 +2218,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
     fdip_buf[7] = tuv110_2;
     fdip_buf[8] = tuv101_2;
     fdip_buf[9] = tuv011_2;
-    idx = ii;    
+    idx = ii;
     for (int m = 0; m < 10; m++) {
       fdip_phi2[idx] = fdip_buf[m];
       idx += inum;
@@ -2244,7 +2244,7 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
     fdip_buf[17] = tuv102;
     fdip_buf[18] = tuv012;
     fdip_buf[19] = tuv111;
-    idx = ii;    
+    idx = ii;
     for (int m = 0; m < 20; m++) {
       fdip_sum_phi[idx] = fdip_buf[m];
       idx += inum;
@@ -2275,12 +2275,12 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1,
   if (ii<inum) {
 
     int nlpts = (bsorder-1) / 2;
-    
+
     int istart = fast_mul(ii,4);
     int igridx = igrid[istart];
     int igridy = igrid[istart+1];
     int igridz = igrid[istart+2];
-    
+
     // now istart is used to index thetai1, thetai2 and thetai3
     istart = fast_mul(ii,bsorder);
 
@@ -2410,7 +2410,7 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1,
     buf[18] = tuv012;
     buf[19] = tuv111;
 
-    int idx = ii;    
+    int idx = ii;
     for (int m = 0; m < 20; m++) {
       fphi[idx] = buf[m];
       idx += inum;
diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu
index 382cd140d9..6709f36b9a 100644
--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@@ -31,7 +31,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
                    const __global numtyp *restrict sp_lj,
                    const __global int * dev_nbor,
                    const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch, const int t_per_atom) {
@@ -41,7 +41,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -59,6 +59,7 @@ __kernel void k_lj(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -110,7 +111,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int * dev_nbor,
                         const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@@ -144,7 +145,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -166,6 +167,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_,
 
     NOUNROLL
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
       #ifndef ONETYPE
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu
index d1f7e3791f..b6e686251f 100644
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@@ -31,7 +31,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@@ -47,7 +47,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -66,6 +66,7 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -118,7 +119,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@@ -139,7 +140,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -161,6 +162,7 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu
index 5c8a2d46b2..ec4290c924 100644
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@@ -36,7 +36,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag,  const int vflag,
                                const int inum, const int nbor_pitch,
@@ -59,7 +59,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -79,6 +79,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -156,7 +157,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
                                     const __global numtyp *restrict sp_lj_in,
                                     const __global int *dev_nbor,
                                     const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum, const int nbor_pitch,
@@ -182,7 +183,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -205,6 +206,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu
index c728967bc5..453faae8e1 100644
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@@ -36,7 +36,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int *dev_nbor,
                         const __global int *dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch,
@@ -59,7 +59,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -79,6 +79,7 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -147,7 +148,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@@ -173,7 +174,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -196,6 +197,7 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu
index 1804625649..01f84c8906 100644
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@@ -36,7 +36,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch,
@@ -60,7 +60,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -80,6 +80,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -154,7 +155,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int *dev_nbor,
                               const __global int *dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch,
@@ -181,7 +182,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -204,6 +205,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu
index 85af3c3433..443fefbceb 100644
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@@ -36,7 +36,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@@ -59,7 +59,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -79,6 +79,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -154,7 +155,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict sp_lj_in,
                                   const __global int *dev_nbor,
                                   const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum,  const int nbor_pitch,
@@ -178,7 +179,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -201,6 +202,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu
index 39fc723736..19f39d27ec 100644
--- a/lib/gpu/lal_lj_coul_msm.cu
+++ b/lib/gpu/lal_lj_coul_msm.cu
@@ -94,7 +94,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@@ -117,7 +117,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -139,6 +139,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_,
     numtyp cut_coul = ucl_sqrt(cut_coulsq);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -215,7 +216,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum,  const int nbor_pitch,
@@ -239,7 +240,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -264,6 +265,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_,
     numtyp cut_coul = ucl_sqrt(cut_coulsq);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu
index df3398afd7..e3d459c082 100644
--- a/lib/gpu/lal_lj_cubic.cu
+++ b/lib/gpu/lal_lj_cubic.cu
@@ -39,7 +39,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj,
                          const __global int * dev_nbor,
                          const __global int * dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@@ -49,7 +49,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -67,6 +67,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -132,7 +133,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp *restrict sp_lj_in,
                               const __global int * dev_nbor,
                               const __global int * dev_packed,
-                              __global acctyp4 *restrict ans,
+                              __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
                               const int eflag, const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom) {
@@ -155,7 +156,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -176,6 +177,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu
index 5beedb0bbb..905120a878 100644
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@@ -38,7 +38,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int *dev_nbor,
                        const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch,
@@ -62,7 +62,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -88,6 +88,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul, r, prefactor, erfcc;
@@ -165,7 +166,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch,
@@ -190,7 +191,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -219,6 +220,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul, r, prefactor, erfcc;
diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu
index 2eff2cd89b..4e161fcfe3 100644
--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@@ -33,7 +33,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@@ -49,7 +49,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -68,6 +68,7 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -122,7 +123,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch, const int t_per_atom) {
@@ -143,7 +144,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -165,6 +166,7 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_expand_coul_long.cu b/lib/gpu/lal_lj_expand_coul_long.cu
index abb3d5ca3f..24b8f1785f 100644
--- a/lib/gpu/lal_lj_expand_coul_long.cu
+++ b/lib/gpu/lal_lj_expand_coul_long.cu
@@ -36,7 +36,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch,
@@ -59,7 +59,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -79,6 +79,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -158,7 +159,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict sp_lj_in,
                                   const __global int *dev_nbor,
                                   const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum,  const int nbor_pitch,
@@ -181,7 +182,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
     lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -204,6 +205,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu
index 4117cc1440..21125a4fae 100644
--- a/lib/gpu/lal_lj_gromacs.cu
+++ b/lib/gpu/lal_lj_gromacs.cu
@@ -34,7 +34,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, const int t_per_atom) {
@@ -50,7 +50,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -68,6 +68,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -134,7 +135,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict sp_lj_in,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag, const int inum,
                                 const int nbor_pitch, const int t_per_atom) {
@@ -156,7 +157,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
     ljsw[tid]=ljsw_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -177,6 +178,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_smooth.cu b/lib/gpu/lal_lj_smooth.cu
index d4a99ed3a7..8b98be17f1 100644
--- a/lib/gpu/lal_lj_smooth.cu
+++ b/lib/gpu/lal_lj_smooth.cu
@@ -33,7 +33,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
                    const __global numtyp *restrict sp_lj,
                    const __global int * dev_nbor,
                    const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch, const int t_per_atom) {
@@ -43,7 +43,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -61,8 +61,9 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
 
     numtyp force, r6inv, factor_lj, forcelj;
     numtyp r, t, tsq, fskin;
-    
+
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -76,10 +77,10 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
       numtyp dely = ix.y-jx.y;
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
-      
+
       int mtype=itype*lj_types+jtype;
       if (rsq<lj1[mtype].z) {
-        
+
         numtyp r2inv=ucl_recip(rsq);
         if (rsq < lj1[mtype].w) {
           r6inv = r2inv*r2inv*r2inv;
@@ -135,7 +136,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int * dev_nbor,
                         const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@@ -169,7 +170,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -194,6 +195,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
 
     NOUNROLL
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
       #ifndef ONETYPE
       factor_lj = sp_lj[sbmask(j)];
@@ -236,7 +238,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
           if (rsq < lj1[mtype].w)
             e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
           else
-            e = ljsw0[mtype].x - ljsw[mtype].x*t - 
+            e = ljsw0[mtype].x - ljsw[mtype].x*t -
               ljsw[mtype].y*tsq/2.0 - ljsw[mtype].z*tsq*t/3.0 -
               ljsw[mtype].w*tsq*tsq/4.0 - lj3[mtype].z; //???
 
diff --git a/lib/gpu/lal_lj_spica.cu b/lib/gpu/lal_lj_spica.cu
index bae8bd57fa..e5976f7013 100644
--- a/lib/gpu/lal_lj_spica.cu
+++ b/lib/gpu/lal_lj_spica.cu
@@ -31,7 +31,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int *dev_nbor,
                        const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@@ -47,7 +47,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -66,6 +66,7 @@ __kernel void k_lj_spica(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -128,7 +129,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
@@ -149,7 +150,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -171,6 +172,7 @@ __kernel void k_lj_spica_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_lj_spica_long.cu b/lib/gpu/lal_lj_spica_long.cu
index 12da967f2e..e0d9bb7944 100644
--- a/lib/gpu/lal_lj_spica_long.cu
+++ b/lib/gpu/lal_lj_spica_long.cu
@@ -36,7 +36,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag,  const int vflag, const int inum,
                             const int nbor_pitch,
@@ -59,7 +59,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
   sp_lj[6]=sp_lj_in[6];
   sp_lj[7]=sp_lj_in[7];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -79,6 +79,7 @@ __kernel void k_lj_spica_long(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
@@ -166,7 +167,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict sp_lj_in,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
-                                 __global acctyp4 *restrict ans,
+                                 __global acctyp3 *restrict ans,
                                  __global acctyp *restrict engv,
                                  const int eflag, const int vflag,
                                  const int inum, const int nbor_pitch,
@@ -189,7 +190,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
     lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, e_coul, virial[6];
   if (EVFLAG) {
@@ -212,6 +213,7 @@ __kernel void k_lj_spica_long_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj, factor_coul;
diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu
index 063daf7256..26ceb38538 100644
--- a/lib/gpu/lal_lj_tip4p_long.cu
+++ b/lib/gpu/lal_lj_tip4p_long.cu
@@ -59,7 +59,7 @@ _texture( q_tex,int2);
 
 /* ----------------------------------------------------------------------
    GPU analogue of Atom::map inline method,
-   but now limited to map_array mapping style. 
+   but now limited to map_array mapping style.
    Map global ID to local index of atom.
 ---------------------------------------------------------------------- */
 ucl_inline int atom_mapping(const __global int *map, tagint glob) {
@@ -134,16 +134,16 @@ ucl_inline void compute_newsite(int iO, int  iH1, int  iH2,
 
 /* ----------------------------------------------------------------------
    Compute resulting forces (ans), energies and virial (engv).
-   An additional term is calculated based on the previously 
-   calculated values on the virlual sites (ansO), 
-   which should be distributed over the real atoms. 
+   An additional term is calculated based on the previously
+   calculated values on the virtual sites (ansO),
+   which should be distributed over the real atoms.
    For some hydrogens, the corresponding oxygens are
    not local atoms and the ansO value is not calculated.
    The required increase is calculated directly in the main function.
 ---------------------------------------------------------------------- */
 __kernel void k_lj_tip4p_long_distrib(
     const __global numtyp4 *restrict x_,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
     __global acctyp *restrict engv,
     const int eflag, const int vflag, const int inum,
     const int nbor_pitch, const int t_per_atom,
@@ -151,11 +151,11 @@ __kernel void k_lj_tip4p_long_distrib(
     const __global numtyp4 *restrict m,
     const int typeO, const int typeH,
     const numtyp alpha,
-    const __global numtyp *restrict q_, 
+    const __global numtyp *restrict q_,
     const __global acctyp4 *restrict ansO) {
 
   int i = BLOCK_ID_X*(BLOCK_SIZE_X)+THREAD_ID_X;
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
 
   if (i<inum) {
@@ -208,7 +208,7 @@ __kernel void k_lj_tip4p_long_distrib(
         engv[inum*engv_iter + i] += vM.z * (acctyp)(1 - alpha);
       }
     }
-    acctyp4 old=ans[i];
+    acctyp3 old=ans[i];
     old.x+=f.x;
     old.y+=f.y;
     old.z+=f.z;
@@ -219,7 +219,7 @@ __kernel void k_lj_tip4p_long_distrib(
 /* ----------------------------------------------------------------------
    Rebuild hneigh after the neighbor build.
    hneight stores local IDs of H1 and H2 for each local and ghost O
-   and local ID of O for each local H. 
+   and local ID of O for each local H.
 ---------------------------------------------------------------------- */
 __kernel void k_lj_tip4p_reneigh(
     const __global numtyp4 *restrict x_,
@@ -230,7 +230,7 @@ __kernel void k_lj_tip4p_reneigh(
     __global int *restrict hneigh,
     __global numtyp4 *restrict m,
     const int typeO, const int typeH,
-    const __global tagint *restrict tag, 
+    const __global tagint *restrict tag,
     const __global int *restrict map,
     const __global int *restrict sametag) {
 
@@ -298,7 +298,7 @@ __kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
       iO  = i;
       numtyp qO; fetch(qO,iO,q_tex);
       if (iH1>=0 && iH2>=0) {
-      	compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_);
+        compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_);
       } else {
         m[iO] = ix;
         m[iO].w = qO;
@@ -313,9 +313,9 @@ __kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
 /* ----------------------------------------------------------------------
    Compute initial value of force, energy and virial for each local particle.
    The values calculated on oxygens use the virtual charge position (m) and
-   they are stored in a separate  array (ansO) for further distribution 
+   they are stored in a separate array (ansO) for further distribution
    in a separate kernel. For some hydrogens located on the boundary
-   of the local region, oxygens are non-local and the contribution 
+   of the local region, oxygens are non-local and the contribution
    of oxygen is calculated separately in this kernel for them .
 ---------------------------------------------------------------------- */
 __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
@@ -325,7 +325,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
     const __global numtyp *restrict sp_lj,
     const __global int * dev_nbor,
     const __global int * dev_packed,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
     __global acctyp *restrict engv,
     const int eflag, const int vflag, const int inum,
     const int nbor_pitch, const int t_per_atom,
@@ -344,7 +344,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_charge();
 
-  acctyp4 f, fO;
+  acctyp3 f;
+  acctyp4 fO;
   f.x=(acctyp)0;  f.y=(acctyp)0;  f.z=(acctyp)0;
   fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
   acctyp energy, e_coul, virial[6], vO[6];
@@ -386,6 +387,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj,factor_coul;
@@ -470,7 +472,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
             e_coul += prefactor*(_erfc-factor_coul);
           }
           if (EVFLAG && vflag) {
-            acctyp4 fd;
+            acctyp3 fd;
             fd.x = delx*force_coul;
             fd.y = dely*force_coul;
             fd.z = delz*force_coul;
@@ -645,7 +647,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
     const __global numtyp *restrict sp_lj_in,
     const __global int * dev_nbor,
     const __global int * dev_packed,
-    __global acctyp4 *restrict ans,
+    __global acctyp3 *restrict ans,
     __global acctyp *restrict engv,
     const int eflag, const int vflag, const int inum,
     const int nbor_pitch, const int t_per_atom,
@@ -674,7 +676,8 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
     if (EVFLAG && eflag)
       lj3[tid]=lj3_in[tid];
   }
-  acctyp4 f, fO;
+  acctyp3 f;
+  acctyp4 fO;
   f.x=(acctyp)0;  f.y=(acctyp)0;  f.z=(acctyp)0;
   fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
   acctyp energy, e_coul, virial[6], vO[6];
@@ -717,6 +720,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
     }
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
       int j=dev_packed[nbor];
 
       numtyp factor_lj,factor_coul;
@@ -801,7 +805,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
             e_coul += prefactor*(_erfc-factor_coul);
           }
           if (EVFLAG && vflag) {
-            acctyp4 fd;
+            acctyp3 fd;
             fd.x = delx*force_coul;
             fd.y = dely*force_coul;
             fd.z = delz*force_coul;
diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu
index 61ac742321..10ae7286b3 100644
--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@@ -31,7 +31,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
                     const __global numtyp *restrict sp_lj_in,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                    __global acctyp4 *restrict ans,
+                    __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@@ -47,7 +47,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -66,6 +66,7 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -119,7 +120,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
                          const __global numtyp *restrict sp_lj_in,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@@ -139,7 +140,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
     mie3[tid]=mie3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -161,6 +162,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu
index b1c8f2673b..0cf66683e2 100644
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@@ -33,7 +33,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj_in,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch, const int t_per_atom) {
@@ -49,7 +49,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -68,6 +68,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -120,7 +121,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, const int t_per_atom) {
@@ -141,7 +142,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
       mor2[tid]=mor2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -163,6 +164,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu
index 359d9b75cb..49173c4c97 100644
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -52,7 +52,7 @@ _texture_2d( pos_tex,int4);
   compute the id of the cell where the atoms belong to
 x: atom coordinates
 cell_id: cell ids
-particle_id: 
+particle_id:
 boxlo[0-2]: the lower left corner of the local box
 ncell[xyz]: the number of cells in xyz dims
 i_cell_size is the inverse cell size
@@ -489,6 +489,10 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
 
 #endif
 
+#define SPECIAL_DATA_PRELOAD_SIZE 3
+#define UNROLL_FACTOR_LIST 4
+#define UNROLL_FACTOR_SPECIAL 2
+
 __kernel void kernel_special(__global int *dev_nbor,
                              __global int *host_nbor_list,
                              const __global int *host_numj,
@@ -526,23 +530,68 @@ __kernel void kernel_special(__global int *dev_nbor,
       list_end=list+fast_mul(numj,stride);
     }
 
-    for ( ; list<list_end; list+=stride) {
-      int nbor=*list;
-      tagint jtag=tag[nbor];
+#if SPECIAL_DATA_PRELOAD_SIZE > 0
+    tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE];
+    for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) {
+      special_preload[j] = special[ii + i*nt];
+    }
+#endif
 
-      int offset=ii;
-      for (int i=0; i<n3; i++) {
-        if (special[offset]==jtag) {
-          int which = 1;
-          if (i>=n1)
-            which++;
-          if (i>=n2)
-            which++;
-          nbor=nbor ^ (which << SBBITS);
-          *list=nbor;
+    // Process UNROLL_FACTOR_LIST neighbor entries per pass; the entry count
+    // need not be a multiple of that factor, so tail slots are bounds-checked.
+    for ( ; list<list_end; list+=UNROLL_FACTOR_LIST * stride) {
+      int nbor[UNROLL_FACTOR_LIST];
+      tagint jtag[UNROLL_FACTOR_LIST];
+      __global int* list_addr[UNROLL_FACTOR_LIST];
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        list_addr[l] = list + l*stride;
+        // Slots at or past list_end get index 0 and are never stored back.
+        nbor[l] = (list_addr[l] < list_end) ? *list_addr[l] : 0;
+      }
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        jtag[l] = tag[nbor[l]];
+      }
+      for (int i=0, j=0; i<n3; i+=UNROLL_FACTOR_SPECIAL, j++) {
+        tagint special_data[UNROLL_FACTOR_SPECIAL];
+        int which[UNROLL_FACTOR_SPECIAL];
+        for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
+          which[c] = 1;
+          if (i + c < n3) {
+#if SPECIAL_DATA_PRELOAD_SIZE > 0
+            if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) {
+              special_data[c] = special_preload[j];
+            }
+            else
+#endif
+              special_data[c] = special[ii + (i+c)*nt];
+          }
         }
-        offset+=nt;
+        // which[] becomes 1, 2 or 3 by position vs. n1/n2, then moves to SBBITS.
+        for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
+          if (i+k >= n1) {
+            which[k]++;
+          }
+        }
+        for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
+          if (i+k >= n2) {
+            which[k]++;
+          }
+          which[k] <<= SBBITS;
+        }
+        for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
+          if (i + c < n3) {
+            for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+              if (special_data[c] == jtag[l]) {
+                nbor[l]=nbor[l] ^ which[c];
+              }
+            }
+          }
+        }
+      }
+      for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
+        if (list_addr[l] < list_end) *list_addr[l] = nbor[l];
       }
     }
   } // if ii
 }
+
diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu
index a8e929efe4..b838feca2a 100644
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@@ -217,7 +217,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
                      const grdtyp delxinv,  const grdtyp delyinv,
                      const grdtyp delzinv, const int order,
                      const int order2, const grdtyp qqrd2e_scale,
-                     __global acctyp4 *restrict ans) {
+                     __global acctyp3 *restrict ans) {
   __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
   __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
   __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
@@ -239,7 +239,7 @@ __kernel void interp(const __global numtyp4 *restrict x_,
     fetch(qs,ii,q_tex);
     qs*=qqrd2e_scale;
 
-    acctyp4 ek;
+    acctyp3 ek;
     ek.x=(acctyp)0.0;
     ek.y=(acctyp)0.0;
     ek.z=(acctyp)0.0;
diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h
index f432757cf1..a66eaca3ac 100644
--- a/lib/gpu/lal_pre_cuda_hip.h
+++ b/lib/gpu/lal_pre_cuda_hip.h
@@ -57,6 +57,7 @@
 #define MAX_SHARED_TYPES 11
 #define MAX_BIO_SHARED_TYPES 128
 #define PPPM_MAX_SPLINE 8
+#define NBOR_PREFETCH 0
 
 // -------------------------------------------------------------------------
 //                              KERNEL MACROS
diff --git a/lib/gpu/lal_pre_ocl_config.h b/lib/gpu/lal_pre_ocl_config.h
index 15c503c942..a854b223ba 100644
--- a/lib/gpu/lal_pre_ocl_config.h
+++ b/lib/gpu/lal_pre_ocl_config.h
@@ -23,7 +23,7 @@
 //   THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
 //   BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
 //   BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
-//   PPPM_MAX_SPLINE}
+//   PPPM_MAX_SPLINE, NBOR_PREFETCH}
 //
 //*************************************************************************/
 
@@ -39,15 +39,15 @@ const char * ocl_config_names[] =
   };
 const char * ocl_config_strings[] =
   {
-   "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8",
-   "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8",
-   "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8",
+   "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8,0",
+   "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
+   "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8,0",
 #ifdef _SINGLE_SINGLE
-   "INTEL_GPU,500,8,16,1,1,4,8,1,64,64,64,64,64,8,128,8,128,8",
-   "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8",
+   "INTEL_GPU,500,8,32,1,1,4,8,2,128,128,128,128,64,8,128,8,128,8,2",
+   "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8,0",
 #else
-   "INTEL_GPU,500,8,16,1,1,2,8,1,64,64,64,64,64,8,128,8,128,8",
-   "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8",
+   "INTEL_GPU,500,8,32,1,1,2,8,2,128,128,128,128,64,8,128,8,128,8,2",
+   "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8,0",
 #endif
-   "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8"
+   "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8,0"
   };
diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h
index ee1ab7b3e2..85e009e96b 100644
--- a/lib/gpu/lal_precision.h
+++ b/lib/gpu/lal_precision.h
@@ -57,6 +57,10 @@ struct _lgpu_float2 {
   float x; float y;
 };
 
+struct _lgpu_float3 {
+  float x; float y; float z;
+};
+
 struct _lgpu_float4 {
   float x; float y; float z; float w;
 };
@@ -65,6 +69,10 @@ struct _lgpu_double2 {
   double x; double y;
 };
 
+struct _lgpu_double3 {
+  double x; double y; double z;
+};
+
 struct _lgpu_double4 {
   double x; double y; double z; double w;
 };
@@ -75,6 +83,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_float2 &v) {
   return out;
 }
 
+// Print the x, y and z members separated by single spaces.
+inline std::ostream & operator<<(std::ostream &out, const _lgpu_float3 &v) {
+  return out << v.x << " " << v.y << " " << v.z;
+}
+
 inline std::ostream & operator<<(std::ostream &out, const _lgpu_float4 &v) {
   out << v.x << " " << v.y << " " << v.z;
   return out;
@@ -85,6 +98,11 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double2 &v) {
   return out;
 }
 
+// Print the x, y and z members separated by single spaces.
+inline std::ostream & operator<<(std::ostream &out, const _lgpu_double3 &v) {
+  return out << v.x << " " << v.y << " " << v.z;
+}
+
 inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
   out << v.x << " " << v.y << " " << v.z;
   return out;
@@ -97,8 +115,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION float
 #define ACC_PRECISION double
 #define numtyp2 _lgpu_float2
+#define numtyp3 _lgpu_float3
 #define numtyp4 _lgpu_float4
 #define acctyp2 _lgpu_double2
+#define acctyp3 _lgpu_double3
 #define acctyp4 _lgpu_double4
 #endif
 
@@ -107,8 +127,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION double
 #define ACC_PRECISION double
 #define numtyp2 _lgpu_double2
+#define numtyp3 _lgpu_double3
 #define numtyp4 _lgpu_double4
 #define acctyp2 _lgpu_double2
+#define acctyp3 _lgpu_double3
 #define acctyp4 _lgpu_double4
 #endif
 
@@ -117,8 +139,10 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
 #define PRECISION float
 #define ACC_PRECISION float
 #define numtyp2 _lgpu_float2
+#define numtyp3 _lgpu_float3
 #define numtyp4 _lgpu_float4
 #define acctyp2 _lgpu_float2
+#define acctyp3 _lgpu_float3
 #define acctyp4 _lgpu_float4
 #endif
 
diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h
index b3243c3b2e..b9ff18bbd4 100644
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -93,6 +93,13 @@
 //     Definition:   Maximum order for splines in PPPM
 //     Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
 //
+//  NBOR_PREFETCH
+//     Definition:   Control use of prefetch for neighbor indices
+//                   0 = No prefetch
+//                   1 = Prefetch using the OpenCL prefetch() built-in
+//                   2 = Prefetch using the Intel LSC prefetch intrinsic
+//     Restrictions: NBOR_PREFETCH forced to 0 when LAL_DISABLE_PREFETCH
+//                   is defined in library build
 //*************************************************************************/
 
 // -------------------------------------------------------------------------
@@ -101,6 +108,7 @@
 
 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_pre_cuda_hip.h"
+#define ucl_prefetch(p)
 #define ucl_pow pow
 #endif
 
@@ -169,7 +177,7 @@
 #define ucl_abs fabs
 #define ucl_erfc erfc
 
-#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
+#if (FAST_MATH > 0) && !defined(_DOUBLE_DOUBLE)
 
 #define ucl_exp native_exp
 #define ucl_pow pow
@@ -285,6 +293,55 @@
   #define simd_size() SIMD_SIZE
 #endif
 
+// -------------------------------------------------------------------------
+//                      OPENCL KERNEL MACROS - PREFETCH
+// -------------------------------------------------------------------------
+
+#if (NBOR_PREFETCH == 0)
+#define ucl_prefetch(p)
+#endif
+
+#if (NBOR_PREFETCH == 1)
+inline void ucl_prefetch(const __global int *p) {  // prefetch hint for the int at p
+  prefetch(p, 1);  // OpenCL built-in; the 1 is an element count, not a byte count
+}
+#endif  // NBOR_PREFETCH == 1
+
+#if (NBOR_PREFETCH == 2)
+// Load message caching control (cacheOpt argument of the LSC prefetch builtin)
+enum LSC_LDCC {
+  LSC_LDCC_DEFAULT,     // presumably no cache override -- TODO confirm
+  LSC_LDCC_L1UC_L3UC,   // Override to L1 uncached and L3 uncached
+  LSC_LDCC_L1UC_L3C,    // Override to L1 uncached and L3 cached
+  LSC_LDCC_L1C_L3UC,    // Override to L1 cached and L3 uncached
+  LSC_LDCC_L1C_L3C,     // Override to L1 cached and L3 cached
+  LSC_LDCC_L1S_L3UC,    // Override to L1 streaming load and L3 uncached
+  LSC_LDCC_L1S_L3C,     // Override to L1 streaming load and L3 cached
+  LSC_LDCC_L1IAR_L3C,   // Override to L1 invalidate-after-read, and L3 cached
+};
+
+void __builtin_IB_lsc_prefetch_global_uint(const __global uint *base,
+                                           int elemOff,  // ucl_prefetch passes 0
+                                           enum LSC_LDCC cacheOpt); // D32V1 (presumably 32-bit data, 1 element -- confirm)
+
+inline void ucl_prefetch(const __global int *p) {  // prefetch hint for the int at p
+  __builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0,
+                                        LSC_LDCC_L1C_L3UC);  // L1 cached, L3 uncached
+}
+#endif
+
+struct _lgpu_float3 {  // plain 3-member struct; backs acctyp3 when _SINGLE_SINGLE is defined
+  float x; float y; float z;
+};
+struct _lgpu_double3 {  // plain 3-member struct; backs acctyp3 in the other precision modes
+  double x; double y; double z;
+};
+#ifdef _SINGLE_SINGLE
+#define acctyp3 struct _lgpu_float3
+#else
+#define acctyp3 struct _lgpu_double3
+#endif  // acctyp3 is now defined, so later "#ifndef acctyp3" fallbacks are skipped
+
 // -------------------------------------------------------------------------
 //                            END OPENCL DEFINITIONS
 // -------------------------------------------------------------------------
@@ -301,6 +358,9 @@
 #define numtyp4 double4
 #define acctyp double
 #define acctyp2 double2
+#ifndef acctyp3
+#define acctyp3 double3
+#endif
 #define acctyp4 double4
 #endif
 
@@ -310,6 +370,9 @@
 #define numtyp4 float4
 #define acctyp double
 #define acctyp2 double2
+#ifndef acctyp3
+#define acctyp3 double3
+#endif
 #define acctyp4 double4
 #endif
 
@@ -319,6 +382,9 @@
 #define numtyp4 float4
 #define acctyp float
 #define acctyp2 float2
+#ifndef acctyp3
+#define acctyp3 float3
+#endif
 #define acctyp4 float4
 #endif
 
diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu
index c69a338749..318bdfdd69 100644
--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@@ -32,6 +32,9 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
   return ans;
 }
 
+#ifdef INTEL_OCL
+__attribute__((intel_reqd_sub_group_size(16)))
+#endif
 __kernel void k_resquared(const __global numtyp4 *restrict x_,
                           const __global numtyp4 *restrict q,
                           const __global numtyp4 *restrict shape,
@@ -41,7 +44,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
                           const int ntypes,
                           const __global int *dev_nbor,
                           const int stride,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           const int astride,
                           __global acctyp *restrict engv,
                           __global int *restrict err_flag,
@@ -62,7 +65,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
   const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0;
   const numtyp cr60=ucl_cbrt((numtyp)60.0);
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, virial[6];
@@ -122,6 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
       int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp
index 17c508f1f1..e53cecd83d 100644
--- a/lib/gpu/lal_re_squared_ext.cpp
+++ b/lib/gpu/lal_re_squared_ext.cpp
@@ -105,28 +105,32 @@ void re_gpu_clear() {
                 tagint **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
                 int **ilist, int **numj, const double cpu_time, bool &success,
-                double **host_quat);
+                const int *ellipsoid, const EllipsoidBonus *bonus);
 
 int** re_gpu_compute_n(const int ago, const int inum_full, const int nall,
                        double **host_x, int *host_type, double *sublo,
-                       double *subhi, tagint *tag, int **nspecial, tagint **special,
-                       const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start, int **ilist,
-                       int **jnum, const double cpu_time, bool &success,
-                       double **host_quat) {
+                       double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum, const double cpu_time,
+                       bool &success, const int *ellipsoid,
+                       const void *bonus) {
   return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi,
                       tag, nspecial, special, eflag, vflag, eatom, vatom,
-                      host_start, ilist, jnum, cpu_time, success, host_quat);
+                      host_start, ilist, jnum, cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }
 
 int * re_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
-                     const double cpu_time, bool &success, double **host_quat) {
+                     const double cpu_time, bool &success,
+                     const int *ellipsoid, const void *bonus) {
   return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist,
                       numj, firstneigh, eflag, vflag, eatom, vatom, host_start,
-                      cpu_time, success, host_quat);
+                      cpu_time, success, ellipsoid,
+                      static_cast<const EllipsoidBonus *>(bonus));
 }
 
 // ---------------------------------------------------------------------------
@@ -135,4 +139,3 @@ int * re_gpu_compute(const int ago, const int inum_full, const int nall,
 double re_gpu_bytes() {
   return REMF.host_memory_usage();
 }
-
diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu
index ca1b08facd..b3347fcb18 100644
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@@ -86,7 +86,7 @@
         ap1+=astride;                                                        \
       }                                                                      \
     }                                                                        \
-    acctyp4 old=ans[ii];                                                     \
+    acctyp3 old=ans[ii];                                                     \
     old.x+=f.x;                                                              \
     old.y+=f.y;                                                              \
     old.z+=f.z;                                                              \
@@ -131,7 +131,7 @@
         ap1+=astride;                                                       \
       }                                                                     \
     }                                                                       \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -154,7 +154,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
                                            const int ntypes,
                                            const __global int *dev_nbor,
                                            const int stride,
-                                           __global acctyp4 *restrict ans,
+                                           __global acctyp3 *restrict ans,
                                            const int astride,
                                            __global acctyp *restrict engv,
                                            __global int *restrict err_flag,
@@ -180,7 +180,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
   const numtyp solv_f_r =
      (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
 
-  acctyp4 f, tor;
+  acctyp3 f, tor;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
   acctyp energy, virial[6];
@@ -216,6 +216,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
       int j=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(j)];
       j &= NEIGHMASK;
@@ -409,7 +410,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
                                            const int ntypes,
                                            const __global int *dev_nbor,
                                            const int stride,
-                                           __global acctyp4 *restrict ans,
+                                           __global acctyp3 *restrict ans,
                                            __global acctyp *restrict engv,
                                            __global int *restrict err_flag,
                                            const int eflag, const int vflag,
@@ -435,7 +436,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
   const numtyp solv_f_r =
     (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -454,6 +455,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_nbor+nbor+n_stride);
       int i=dev_nbor[nbor];
       factor_lj = sp_lj[sbmask(i)];
       i &= NEIGHMASK;
@@ -610,7 +612,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict gum,
                              const int stride,
                              const __global int *dev_ij,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              __global int *restrict err_flag,
                              const int eflag, const int vflag, const int start,
@@ -628,7 +630,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
   sp_lj[2]=gum[2];
   sp_lj[3]=gum[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -647,6 +649,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);
 
       int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -697,7 +700,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict gum,
                                   const int stride,
                                   const __global int *dev_ij,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   __global int *restrict err_flag,
                                   const int eflag, const int vflag,
@@ -721,7 +724,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
       lj3[tid]=lj3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -743,6 +746,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_ij+nbor+n_stride);
 
       int j=dev_ij[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu
index 74ac0e0c97..d445f9f469 100644
--- a/lib/gpu/lal_soft.cu
+++ b/lib/gpu/lal_soft.cu
@@ -32,7 +32,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
                      const __global numtyp *restrict sp_lj_in,
                      const __global int *dev_nbor,
                      const __global int *dev_packed,
-                     __global acctyp4 *restrict ans,
+                     __global acctyp3 *restrict ans,
                      __global acctyp *restrict engv,
                      const int eflag, const int vflag, const int inum,
                      const int nbor_pitch, const int t_per_atom) {
@@ -48,7 +48,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -67,6 +67,7 @@ __kernel void k_soft(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -119,7 +120,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
                           const __global numtyp *restrict sp_lj_in,
                           const __global int *dev_nbor,
                           const __global int *dev_packed,
-                          __global acctyp4 *restrict ans,
+                          __global acctyp3 *restrict ans,
                           __global acctyp *restrict engv,
                           const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
@@ -137,7 +138,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
     coeff[tid]=coeff_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -159,6 +160,7 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu
index a974c5f193..a5f23d4d69 100644
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@@ -57,7 +57,7 @@ _texture( sw3_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -116,7 +116,7 @@ _texture( sw3_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -194,7 +194,7 @@ _texture( sw3_tex,int4);
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -265,7 +265,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
                    const __global numtyp4 * restrict c_14,
                    const __global numtyp2 * restrict c_56,
                    const int ntypes, const __global int * dev_nbor,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch, const int t_per_atom,
@@ -282,7 +282,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
   if (EVFLAG && eflag) pre_sw_c56=c_56[ONETYPE];
   #endif
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -461,7 +461,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
                                 const __global numtyp2 *restrict sw_pre3,
                                 const int ntypes,
                                 const __global int * dev_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
                                 const int inum,  const int nbor_pitch,
@@ -480,7 +480,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
   const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
   #endif
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -579,7 +579,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
                              const __global numtyp2 *restrict sw_pre3,
                              const int ntypes, const __global int * dev_nbor,
                              const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag,
                              const int inum,  const int nbor_pitch,
@@ -598,7 +598,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
   const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
   #endif
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -701,7 +701,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
                              const __global numtyp2 *restrict sw_pre3,
                              const int ntypes, const __global int * dev_nbor,
                              const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag,
                              const int inum,  const int nbor_pitch,
@@ -720,7 +720,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
   const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y;
   #endif
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu
index eb29218712..8623e9de72 100644
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@@ -49,7 +49,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
                       const __global numtyp *restrict sp_lj_in,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
-                      __global acctyp4 *restrict ans,
+                      __global acctyp3 *restrict ans,
                       __global acctyp *restrict engv,
                       const int eflag, const int vflag, const int inum,
                       const int nbor_pitch, const int t_per_atom,
@@ -66,7 +66,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -87,6 +87,7 @@ __kernel void k_table(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -146,7 +147,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict sp_lj_in,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
-                           __global acctyp4 *restrict ans,
+                           __global acctyp3 *restrict ans,
                            __global acctyp *restrict engv,
                            const int eflag, const int vflag, const int inum,
                            const int nbor_pitch, const int t_per_atom,
@@ -165,7 +166,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
     cutsq[tid]=cutsq_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -189,6 +190,7 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -251,7 +253,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch, const int t_per_atom,
@@ -268,7 +270,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -289,6 +291,7 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -352,7 +355,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict sp_lj_in,
                                   const __global int *dev_nbor,
                                   const __global int *dev_packed,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
@@ -371,7 +374,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
     cutsq[tid]=cutsq_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -395,6 +398,7 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -461,7 +465,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
                              const __global numtyp *restrict sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch, const int t_per_atom,
@@ -478,7 +482,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -499,6 +503,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -569,7 +574,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
                                   const __global numtyp* sp_lj_in,
                                   const __global int *dev_nbor,
                                   const __global int *dev_packed,
-                                  __global acctyp4 *ans,
+                                  __global acctyp3 *ans,
                                   __global acctyp *engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
@@ -588,7 +593,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
     cutsq[tid]=cutsq_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -611,6 +616,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -686,7 +692,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
                              const __global numtyp* sp_lj_in,
                              const __global int *dev_nbor,
                              const __global int *dev_packed,
-                             __global acctyp4 *ans,
+                             __global acctyp3 *ans,
                              __global acctyp *engv,
                              const int eflag, const int vflag, const int inum,
                              const int nbor_pitch, const int t_per_atom,
@@ -703,7 +709,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -724,6 +730,7 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -792,7 +799,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
                                   const __global numtyp* sp_lj_in,
                                   const __global int *dev_nbor,
                                   const __global int *dev_packed,
-                                  __global acctyp4 *ans,
+                                  __global acctyp3 *ans,
                                   __global acctyp *engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
@@ -811,7 +818,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
     cutsq[tid]=cutsq_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -835,6 +842,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu
index feab8bb5c0..b72668f6c4 100644
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -132,7 +132,7 @@ _texture_2d( pos_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -211,7 +211,7 @@ _texture_2d( pos_tex,int4);
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -448,7 +448,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
                                   const __global int *restrict elem2param,
                                   const int nelements, const int nparams,
                                   const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
@@ -472,7 +472,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
   const numtyp ijparam_bigd = ts2_in[ONETYPE3].w;
   #endif
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -553,7 +553,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
                                      const __global acctyp2 *restrict zetaij,
                                      const __global acctyp *restrict zetaij_e,
                                      const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                      __global acctyp *restrict engv,
                                      const int eflag, const int vflag,
                                      const int inum,  const int nbor_pitch,
@@ -585,7 +585,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
   const numtyp gamma = ts4_in[ONETYPE3].w;
   #endif
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -728,7 +728,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
                                   const __global acctyp *restrict zetaij_e,
                                   const __global int * dev_nbor,
                                   const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum,  const int nbor_pitch,
@@ -760,7 +760,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
   const numtyp gamma = ts4_in[ONETYPE3].w;
   #endif
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -950,7 +950,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
                                     const __global acctyp *restrict zetaij_e,
                                     const __global int * dev_nbor,
                                     const __global int * dev_ilist,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum,  const int nbor_pitch,
@@ -982,7 +982,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
   const numtyp gamma = ts4_in[ONETYPE3].w;
   #endif
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu
index 1eb57683d5..c6e7a8655e 100644
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@@ -63,7 +63,7 @@ _texture_2d( pos_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -131,7 +131,7 @@ _texture_2d( pos_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -209,7 +209,7 @@ _texture_2d( pos_tex,int4);
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -417,7 +417,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
                                   const __global int *restrict elem2param,
                                   const int nelements, const int nparams,
                                   const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
@@ -434,7 +434,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
     ts2[tid]=ts2_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -511,7 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
                                      const int nelements, const int nparams,
                                      const __global acctyp4 *restrict zetaij,
                                      const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                      __global acctyp *restrict engv,
                                      const int eflag, const int vflag,
                                      const int inum,  const int nbor_pitch,
@@ -535,7 +535,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
     ts5[tid]=ts5_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -676,7 +676,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
                                   const __global acctyp4 *restrict zetaij,
                                   const __global int * dev_nbor,
                                   const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum,  const int nbor_pitch,
@@ -700,7 +700,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
     ts5[tid]=ts5_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -890,7 +890,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
                                         const __global acctyp4 *restrict zetaij,
                                         const __global int * dev_nbor,
                                         const __global int * dev_ilist,
-                                        __global acctyp4 *restrict ans,
+                                        __global acctyp3 *restrict ans,
                                         __global acctyp *restrict engv,
                                         const int eflag, const int vflag,
                                         const int inum,  const int nbor_pitch,
@@ -914,7 +914,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
     ts5[tid]=ts5_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu
index 6250fa55de..f9a5e8f69b 100644
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@@ -81,7 +81,7 @@ _texture( ts6_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -149,7 +149,7 @@ _texture( ts6_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -227,7 +227,7 @@ _texture( ts6_tex,int4);
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -443,7 +443,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
                                   const __global int *restrict elem2param,
                                   const int nelements, const int nparams,
                                   const __global int * dev_nbor,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum, const int nbor_pitch,
@@ -462,7 +462,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
     ts6[tid]=ts6_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -544,7 +544,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
                                      const int nelements, const int nparams,
                                      const __global acctyp4 *restrict zetaij,
                                      const __global int * dev_nbor,
-                                     __global acctyp4 *restrict ans,
+                                     __global acctyp3 *restrict ans,
                                      __global acctyp *restrict engv,
                                      const int eflag, const int vflag,
                                      const int inum,  const int nbor_pitch,
@@ -566,7 +566,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
     ts4[tid]=ts4_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -703,7 +703,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
                                   const __global acctyp4 *restrict zetaij,
                                   const __global int * dev_nbor,
                                   const __global int * dev_ilist,
-                                  __global acctyp4 *restrict ans,
+                                  __global acctyp3 *restrict ans,
                                   __global acctyp *restrict engv,
                                   const int eflag, const int vflag,
                                   const int inum,  const int nbor_pitch,
@@ -725,7 +725,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
     ts4[tid]=ts4_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -908,7 +908,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
                                         const __global acctyp4 *restrict zetaij,
                                         const __global int * dev_nbor,
                                         const __global int * dev_ilist,
-                                        __global acctyp4 *restrict ans,
+                                        __global acctyp3 *restrict ans,
                                         __global acctyp *restrict engv,
                                         const int eflag, const int vflag,
                                         const int inum,  const int nbor_pitch,
@@ -930,7 +930,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
     ts4[tid]=ts4_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
diff --git a/lib/gpu/lal_ufm.cu b/lib/gpu/lal_ufm.cu
index 9d6c7b978a..e9ec06462d 100644
--- a/lib/gpu/lal_ufm.cu
+++ b/lib/gpu/lal_ufm.cu
@@ -33,7 +33,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
                    const __global numtyp *restrict sp_lj,
                    const __global int * dev_nbor,
                    const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch, const int t_per_atom) {
@@ -43,7 +43,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -61,6 +61,7 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -109,7 +110,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
                         const __global numtyp *restrict sp_lj_in,
                         const __global int * dev_nbor,
                         const __global int * dev_packed,
-                        __global acctyp4 *restrict ans,
+                        __global acctyp3 *restrict ans,
                         __global acctyp *restrict engv,
                         const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
@@ -130,7 +131,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
       uf3[tid]=uf3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -151,6 +152,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu
index f4595f4715..b753e1db5a 100644
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@@ -73,7 +73,7 @@ _texture( param5_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -132,7 +132,7 @@ _texture( param5_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -210,7 +210,7 @@ _texture( param5_tex,int4);
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
   if (offset==0 && ii<inum) {                                               \
-    acctyp4 old=ans[ii];                                                    \
+    acctyp3 old=ans[ii];                                                    \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
@@ -247,6 +247,7 @@ __kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
     const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
 
     for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      ucl_prefetch(dev_packed+nbor+nbor_pitch);
       int sj=dev_packed[nbor];
       int j = sj & NEIGHMASK;
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -283,7 +284,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
                    const __global int *restrict elem2param,
                    const int nelements,
                    const __global int * dev_packed,
-                   __global acctyp4 *restrict ans,
+                   __global acctyp3 *restrict ans,
                    __global acctyp *restrict engv,
                    const int eflag, const int vflag, const int inum,
                    const int nbor_pitch, const int ev_stride) {
@@ -291,7 +292,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
 
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -313,6 +314,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
     itype=map[itype];
 
     for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      ucl_prefetch(dev_packed+nbor+nbor_pitch);
 
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
@@ -489,7 +491,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
                                 const __global int *restrict elem2param,
                                 const int nelements,
                                 const __global int * dev_nbor,
-                                __global acctyp4 *restrict ans,
+                                __global acctyp3 *restrict ans,
                                 __global acctyp *restrict engv,
                                 const int eflag, const int vflag,
                                 const int inum,  const int nbor_pitch,
@@ -504,7 +506,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
 
   local_allocate_store_three();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -612,7 +614,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
                              const int nelements,
                              const __global int * dev_nbor,
                              const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag,
                              const int inum,  const int nbor_pitch,
@@ -627,7 +629,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
 
   local_allocate_store_three();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -743,7 +745,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
                              const int nelements,
                              const __global int * dev_nbor,
                              const __global int * dev_ilist,
-                             __global acctyp4 *restrict ans,
+                             __global acctyp3 *restrict ans,
                              __global acctyp *restrict engv,
                              const int eflag, const int vflag,
                              const int inum,  const int nbor_pitch,
@@ -758,7 +760,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
 
   local_allocate_store_three();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu
index 6ebd2dc06d..4a92484003 100644
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@@ -30,7 +30,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
                        const __global numtyp *restrict sp_lj_in,
                        const __global int *dev_nbor,
                        const __global int *dev_packed,
-                       __global acctyp4 *restrict ans,
+                       __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
@@ -46,7 +46,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -65,6 +65,7 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -118,7 +119,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict sp_lj_in,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
-                            __global acctyp4 *restrict ans,
+                            __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
                             const int eflag, const int vflag, const int inum,
                             const int nbor_pitch, const int t_per_atom) {
@@ -136,7 +137,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
     coeff[tid]=coeff_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -158,6 +159,7 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu
index 847ffa6d80..044e7ac9c3 100644
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@@ -40,7 +40,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
                                const __global numtyp *restrict sp_lj_in,
                                const __global int *dev_nbor,
                                const __global int *dev_packed,
-                               __global acctyp4 *restrict ans,
+                               __global acctyp3 *restrict ans,
                                __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
                                const int nbor_pitch, const int t_per_atom,
@@ -57,7 +57,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -77,6 +77,7 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
@@ -131,7 +132,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
                                     const __global numtyp *restrict sp_lj_in,
                                     const __global int *dev_nbor,
                                     const __global int *dev_packed,
-                                    __global acctyp4 *restrict ans,
+                                    __global acctyp3 *restrict ans,
                                     __global acctyp *restrict engv,
                                     const int eflag, const int vflag,
                                     const int inum, const int nbor_pitch,
@@ -150,7 +151,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
     coeff[tid]=coeff_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -173,6 +174,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
 
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       factor_lj = sp_lj[sbmask(j)];
diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu
index 2a7d4795da..880ce918b1 100644
--- a/lib/gpu/lal_zbl.cu
+++ b/lib/gpu/lal_zbl.cu
@@ -88,7 +88,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
                     const int lj_types,
                     const __global int *dev_nbor,
                     const __global int *dev_packed,
-                    __global acctyp4 *restrict ans,
+                    __global acctyp3 *restrict ans,
                     __global acctyp *restrict engv,
                     const int eflag, const int vflag, const int inum,
                     const int nbor_pitch, const int t_per_atom) {
@@ -98,7 +98,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
   int n_stride;
   local_allocate_store_pair();
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -116,6 +116,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
     int itype=ix.w;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
@@ -179,7 +180,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
                          const numtyp cut_inner,
                          const __global int *dev_nbor,
                          const __global int *dev_packed,
-                         __global acctyp4 *restrict ans,
+                         __global acctyp3 *restrict ans,
                          __global acctyp *restrict engv,
                          const int eflag, const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
@@ -198,7 +199,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     coeff3[tid]=coeff3_in[tid];
   }
 
-  acctyp4 f;
+  acctyp3 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   acctyp energy, virial[6];
   if (EVFLAG) {
@@ -219,6 +220,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
+      ucl_prefetch(dev_packed+nbor+n_stride);
 
       int j=dev_packed[nbor];
       j &= NEIGHMASK;
diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index 23191c12c8..92f4f256b2 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -290,6 +290,20 @@ void FixGPU::init()
 
 void FixGPU::setup(int vflag)
 {
+  // See if we should overlap topology list builds on CPU with work on GPU
+  int overlap_topo = 0;
+  if (atom->molecular != Atom::ATOMIC) {
+    // checked downcast: ph is null unless the matched style really is a PairHybrid
+    PairHybrid *ph = dynamic_cast<PairHybrid *>(force->pair_match("^hybrid",0));
+    if (ph) {
+      for (int isub=0; isub < ph->nstyles; ++isub)
+        if (force->pair_match("gpu",0,isub)) overlap_topo = 1;
+    } else if (force->pair_match("gpu",0)) {
+      overlap_topo = 1;
+    }
+  }
+  if (overlap_topo) neighbor->set_overlap_topo(1);
+
   if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
     if (neighbor->exclude_setting() != 0)
       error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds");
diff --git a/src/GPU/fix_nve_asphere_gpu.cpp b/src/GPU/fix_nve_asphere_gpu.cpp
index 06d1d7a7ca..481f44bb63 100644
--- a/src/GPU/fix_nve_asphere_gpu.cpp
+++ b/src/GPU/fix_nve_asphere_gpu.cpp
@@ -243,12 +243,7 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
     // update angular momentum by 1/2 step
     if (igroup == 0) {
       #if (LAL_USE_OMP_SIMD == 1)
-        // Workaround for compiler bug
-        #ifdef __INTEL_COMPILER
-        #pragma simd
-        #else
-        #pragma omp simd
-        #endif
+      #pragma omp simd
       #endif
       for (int i = ifrom; i < ito; i++) {
         double *quat = bonus[ellipsoid[i]].quat;
@@ -257,12 +252,7 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/)
       }
     } else {
       #if (LAL_USE_OMP_SIMD == 1)
-        // Workaround for compiler bug
-        #ifdef __INTEL_COMPILER
-        #pragma simd
-        #else
-        #pragma omp simd
-        #endif
+      #pragma omp simd
       #endif
       for (int i = ifrom; i < ito; i++) {
         if (mask[i] & groupbit) {
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index fd423486fd..1221db66b1 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -155,6 +155,15 @@ PairAmoebaGPU::~PairAmoebaGPU()
   amoeba_gpu_clear();
 }
 
+/* ---------------------------------------------------------------------- */
+
+void PairAmoebaGPU::compute(int eflag, int vflag)  // overrides PairAmoeba::compute
+{
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)  // non-atomic system and ago == 0
+    neighbor->build_topology();  // presumably (re)builds topology lists here -- TODO confirm
+  PairAmoeba::compute(eflag, vflag);  // then run the inherited compute with the same flags
+}
+
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h
index be53f7ef50..75f0d26336 100644
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@@ -28,6 +28,7 @@ class PairAmoebaGPU : public PairAmoeba {
  public:
   PairAmoebaGPU(LAMMPS *lmp);
   ~PairAmoebaGPU() override;
+  void compute(int, int) override;
   void init_style() override;
   double memory_usage() override;
 
diff --git a/src/GPU/pair_beck_gpu.cpp b/src/GPU/pair_beck_gpu.cpp
index 3c21a99105..8d057fd317 100644
--- a/src/GPU/pair_beck_gpu.cpp
+++ b/src/GPU/pair_beck_gpu.cpp
@@ -109,6 +109,8 @@ void PairBeckGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_born_coul_long_cs_gpu.cpp b/src/GPU/pair_born_coul_long_cs_gpu.cpp
index 788a46e2cb..798caeb97a 100644
--- a/src/GPU/pair_born_coul_long_cs_gpu.cpp
+++ b/src/GPU/pair_born_coul_long_cs_gpu.cpp
@@ -129,6 +129,8 @@ void PairBornCoulLongCSGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_born_coul_long_gpu.cpp b/src/GPU/pair_born_coul_long_gpu.cpp
index 629f716fd6..ca12f03070 100644
--- a/src/GPU/pair_born_coul_long_gpu.cpp
+++ b/src/GPU/pair_born_coul_long_gpu.cpp
@@ -123,6 +123,8 @@ void PairBornCoulLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp
index 214a9575be..9858015622 100644
--- a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp
+++ b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp
@@ -117,6 +117,8 @@ void PairBornCoulWolfCSGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_born_coul_wolf_gpu.cpp b/src/GPU/pair_born_coul_wolf_gpu.cpp
index 02a671adc9..ce9956d232 100644
--- a/src/GPU/pair_born_coul_wolf_gpu.cpp
+++ b/src/GPU/pair_born_coul_wolf_gpu.cpp
@@ -114,6 +114,8 @@ void PairBornCoulWolfGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_born_gpu.cpp b/src/GPU/pair_born_gpu.cpp
index 905278cdb7..9499cd7307 100644
--- a/src/GPU/pair_born_gpu.cpp
+++ b/src/GPU/pair_born_gpu.cpp
@@ -109,6 +109,8 @@ void PairBornGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_buck_coul_cut_gpu.cpp b/src/GPU/pair_buck_coul_cut_gpu.cpp
index 125ffbfbbd..b6e1e8fbed 100644
--- a/src/GPU/pair_buck_coul_cut_gpu.cpp
+++ b/src/GPU/pair_buck_coul_cut_gpu.cpp
@@ -111,6 +111,8 @@ void PairBuckCoulCutGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_buck_coul_long_gpu.cpp b/src/GPU/pair_buck_coul_long_gpu.cpp
index ca90b3e869..adae92d1ac 100644
--- a/src/GPU/pair_buck_coul_long_gpu.cpp
+++ b/src/GPU/pair_buck_coul_long_gpu.cpp
@@ -120,6 +120,8 @@ void PairBuckCoulLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_buck_gpu.cpp b/src/GPU/pair_buck_gpu.cpp
index d6dcdf30bc..4e11a2ec2d 100644
--- a/src/GPU/pair_buck_gpu.cpp
+++ b/src/GPU/pair_buck_gpu.cpp
@@ -107,6 +107,8 @@ void PairBuckGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_colloid_gpu.cpp b/src/GPU/pair_colloid_gpu.cpp
index c0e85907bb..510c4ef12f 100644
--- a/src/GPU/pair_colloid_gpu.cpp
+++ b/src/GPU/pair_colloid_gpu.cpp
@@ -109,6 +109,8 @@ void PairColloidGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_coul_cut_gpu.cpp b/src/GPU/pair_coul_cut_gpu.cpp
index d48ee1cb7b..240ed2f91e 100644
--- a/src/GPU/pair_coul_cut_gpu.cpp
+++ b/src/GPU/pair_coul_cut_gpu.cpp
@@ -108,6 +108,8 @@ void PairCoulCutGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_coul_debye_gpu.cpp b/src/GPU/pair_coul_debye_gpu.cpp
index ed9781c016..7d1fe8d546 100644
--- a/src/GPU/pair_coul_debye_gpu.cpp
+++ b/src/GPU/pair_coul_debye_gpu.cpp
@@ -109,6 +109,8 @@ void PairCoulDebyeGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_coul_dsf_gpu.cpp b/src/GPU/pair_coul_dsf_gpu.cpp
index a4837ed8cb..bf207caf60 100644
--- a/src/GPU/pair_coul_dsf_gpu.cpp
+++ b/src/GPU/pair_coul_dsf_gpu.cpp
@@ -118,6 +118,8 @@ void PairCoulDSFGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_coul_long_cs_gpu.cpp b/src/GPU/pair_coul_long_cs_gpu.cpp
index 921a294721..79c4c4ab7c 100644
--- a/src/GPU/pair_coul_long_cs_gpu.cpp
+++ b/src/GPU/pair_coul_long_cs_gpu.cpp
@@ -123,6 +123,8 @@ void PairCoulLongCSGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_coul_long_gpu.cpp b/src/GPU/pair_coul_long_gpu.cpp
index 0b773882b2..7ecb052f69 100644
--- a/src/GPU/pair_coul_long_gpu.cpp
+++ b/src/GPU/pair_coul_long_gpu.cpp
@@ -117,6 +117,8 @@ void PairCoulLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_dpd_gpu.cpp b/src/GPU/pair_dpd_gpu.cpp
index 716978deac..e4657cf2eb 100644
--- a/src/GPU/pair_dpd_gpu.cpp
+++ b/src/GPU/pair_dpd_gpu.cpp
@@ -256,6 +256,8 @@ void PairDPDGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_dpd_tstat_gpu.cpp b/src/GPU/pair_dpd_tstat_gpu.cpp
index 029bf7245e..4a7b05fd2c 100644
--- a/src/GPU/pair_dpd_tstat_gpu.cpp
+++ b/src/GPU/pair_dpd_tstat_gpu.cpp
@@ -272,6 +272,8 @@ void PairDPDTstatGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_eam_alloy_gpu.cpp b/src/GPU/pair_eam_alloy_gpu.cpp
index d1d73e415c..4b7693e989 100644
--- a/src/GPU/pair_eam_alloy_gpu.cpp
+++ b/src/GPU/pair_eam_alloy_gpu.cpp
@@ -138,6 +138,8 @@ void PairEAMAlloyGPU::compute(int eflag, int vflag)
     eam_alloy_gpu_compute_force(nullptr, eflag, vflag, eflag_atom, vflag_atom);
   else
     eam_alloy_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom);
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/GPU/pair_eam_fs_gpu.cpp b/src/GPU/pair_eam_fs_gpu.cpp
index c1a4c74d52..9da4e20a6f 100644
--- a/src/GPU/pair_eam_fs_gpu.cpp
+++ b/src/GPU/pair_eam_fs_gpu.cpp
@@ -138,6 +138,8 @@ void PairEAMFSGPU::compute(int eflag, int vflag)
     eam_fs_gpu_compute_force(nullptr, eflag, vflag, eflag_atom, vflag_atom);
   else
     eam_fs_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom);
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp
index 17af6cfb22..4cb7c7f749 100644
--- a/src/GPU/pair_eam_gpu.cpp
+++ b/src/GPU/pair_eam_gpu.cpp
@@ -136,6 +136,8 @@ void PairEAMGPU::compute(int eflag, int vflag)
     eam_gpu_compute_force(nullptr, eflag, vflag, eflag_atom, vflag_atom);
   else
     eam_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom);
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/GPU/pair_gauss_gpu.cpp b/src/GPU/pair_gauss_gpu.cpp
index 17b9e9a650..e6e4ccae1b 100644
--- a/src/GPU/pair_gauss_gpu.cpp
+++ b/src/GPU/pair_gauss_gpu.cpp
@@ -106,6 +106,8 @@ void PairGaussGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp
index 5f12b1eaf4..c0b0c2ecb0 100644
--- a/src/GPU/pair_gayberne_gpu.cpp
+++ b/src/GPU/pair_gayberne_gpu.cpp
@@ -35,33 +35,39 @@ using namespace LAMMPS_NS;
 
 // External functions from cuda library for atom decomposition
 
-int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, const double mu,
-                double **shape, double **well, double **cutsq, double **sigma, double **epsilon,
-                double *host_lshape, int **form, double **host_lj1, double **host_lj2,
-                double **host_lj3, double **host_lj4, double **offset, double *special_lj,
-                const int nlocal, const int nall, const int max_nbors, const int maxspecial,
+int gb_gpu_init(const int ntypes, const double gamma, const double upsilon,
+                const double mu, double **shape, double **well, double **cutsq,
+                double **sigma, double **epsilon, double *host_lshape,
+                int **form, double **host_lj1, double **host_lj2,
+                double **host_lj3, double **host_lj4, double **offset,
+                double *special_lj, const int nlocal, const int nall,
+                const int max_nbors, const int maxspecial,
                 const double cell_size, int &gpu_mode, FILE *screen);
 void gb_gpu_clear();
-int **gb_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x,
-                       int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial,
-                       tagint **special, const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start, int **ilist, int **jnum,
-                       const double cpu_time, bool &success, double **host_quat);
-int *gb_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type,
-                    int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag,
-                    const bool eatom, const bool vatom, int &host_start, const double cpu_time,
-                    bool &success, double **host_quat);
+int **gb_gpu_compute_n(const int ago, const int inum, const int nall,
+                       double **host_x, int *host_type, double *sublo,
+                       double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum, const double cpu_time,
+                       bool &success, const int *ellipsoid,
+                       const void *bonus);
+int *gb_gpu_compute(const int ago, const int inum, const int nall,
+                    double **host_x, int *host_type, int *ilist, int *numj,
+                    int **firstneigh, const bool eflag, const bool vflag,
+                    const bool eatom, const bool vatom, int &host_start,
+                    const double cpu_time, bool &success, const int *ellipsoid,
+                    const void *bonus);
 double gb_gpu_bytes();
 
 enum { SPHERE_SPHERE, SPHERE_ELLIPSE, ELLIPSE_SPHERE, ELLIPSE_ELLIPSE };
 
 /* ---------------------------------------------------------------------- */
 
-PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), gpu_mode(GPU_FORCE)
+PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp),
+                                                gpu_mode(GPU_FORCE)
 {
-  quat_nmax = 0;
   reinitflag = 0;
-  quat = nullptr;
   suffix_flag |= Suffix::GPU;
   GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }
@@ -74,7 +80,6 @@ PairGayBerneGPU::~PairGayBerneGPU()
 {
   gb_gpu_clear();
   cpu_time = 0.0;
-  memory->destroy(quat);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -89,21 +94,8 @@ void PairGayBerneGPU::compute(int eflag, int vflag)
   bool success = true;
   int *ilist, *numneigh, **firstneigh;
 
-  if (nall > quat_nmax) {
-    quat_nmax = static_cast<int>(1.1 * nall);
-    memory->grow(quat, quat_nmax, 4, "pair:quat");
-  }
   AtomVecEllipsoid::Bonus *bonus = avec->bonus;
   int *ellipsoid = atom->ellipsoid;
-  for (int i = 0; i < nall; i++) {
-    int qi = ellipsoid[i];
-    if (qi > -1) {
-      quat[i][0] = bonus[qi].quat[0];
-      quat[i][1] = bonus[qi].quat[1];
-      quat[i][2] = bonus[qi].quat[2];
-      quat[i][3] = bonus[qi].quat[3];
-    }
-  }
 
   if (gpu_mode != GPU_FORCE) {
     double sublo[3], subhi[3];
@@ -119,19 +111,24 @@ void PairGayBerneGPU::compute(int eflag, int vflag)
     }
     inum = atom->nlocal;
     firstneigh =
-        gb_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag,
-                         atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom,
-                         host_start, &ilist, &numneigh, cpu_time, success, quat);
+        gb_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo,
+                         subhi, atom->tag, atom->nspecial, atom->special,
+                         eflag, vflag, eflag_atom, vflag_atom,
+                         host_start, &ilist, &numneigh, cpu_time, success,
+                         ellipsoid, bonus);
   } else {
     inum = list->inum;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
-    ilist = gb_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, list->ilist, numneigh,
-                           firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
-                           success, quat);
+    ilist = gb_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+                           list->ilist, numneigh, firstneigh, eflag, vflag,
+                           eflag_atom, vflag_atom, host_start, cpu_time,
+                           success, ellipsoid, bonus);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
@@ -185,14 +182,13 @@ void PairGayBerneGPU::init_style()
   if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
   int mnf = 5e-2 * neighbor->oneatom;
   int success =
-      gb_gpu_init(atom->ntypes + 1, gamma, upsilon, mu, shape2, well, cutsq, sigma, epsilon, lshape,
-                  form, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal,
-                  atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen);
+      gb_gpu_init(atom->ntypes + 1, gamma, upsilon, mu, shape2, well, cutsq,
+                  sigma, epsilon, lshape, form, lj1, lj2, lj3, lj4, offset,
+                  force->special_lj, atom->nlocal, atom->nlocal + atom->nghost,
+                  mnf, maxspecial, cell_size, gpu_mode, screen);
   GPU_EXTRA::check_flag(success, error, world);
 
   if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL);
-  quat_nmax = static_cast<int>(1.1 * (atom->nlocal + atom->nghost));
-  memory->grow(quat, quat_nmax, 4, "pair:quat");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -200,12 +196,13 @@ void PairGayBerneGPU::init_style()
 double PairGayBerneGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
-  return bytes + memory->usage(quat, quat_nmax) + gb_gpu_bytes();
+  return bytes + gb_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
 
-void PairGayBerneGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist,
+void PairGayBerneGPU::cpu_compute(int start, int inum, int eflag,
+                                  int /* vflag */, int *ilist,
                                   int *numneigh, int **firstneigh)
 {
   int i, j, ii, jj, jnum, itype, jtype;
diff --git a/src/GPU/pair_gayberne_gpu.h b/src/GPU/pair_gayberne_gpu.h
index 89d21b9046..1ce760352c 100644
--- a/src/GPU/pair_gayberne_gpu.h
+++ b/src/GPU/pair_gayberne_gpu.h
@@ -38,8 +38,6 @@ class PairGayBerneGPU : public PairGayBerne {
  private:
   int gpu_mode;
   double cpu_time;
-  int quat_nmax;
-  double **quat;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp
index 9d286d5db7..8287a7c09d 100644
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@@ -172,6 +172,15 @@ PairHippoGPU::~PairHippoGPU()
   hippo_gpu_clear();
 }
 
+/* ---------------------------------------------------------------------- */
+
+void PairHippoGPU::compute(int eflag, int vflag)    // override declared in pair_hippo_gpu.h
+{
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)    // non-atomic system and ago == 0
+    neighbor->build_topology();    // done here, ahead of the base-class compute below
+  PairAmoeba::compute(eflag, vflag);    // eflag/vflag are forwarded unchanged to PairAmoeba
+}
+
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h
index d160446d77..50f362bafc 100644
--- a/src/GPU/pair_hippo_gpu.h
+++ b/src/GPU/pair_hippo_gpu.h
@@ -28,6 +28,7 @@ class PairHippoGPU : public PairAmoeba {
  public:
   PairHippoGPU(LAMMPS *lmp);
   ~PairHippoGPU() override;
+  void compute(int, int) override;
   void init_style() override;
   double memory_usage() override;
 
diff --git a/src/GPU/pair_lj96_cut_gpu.cpp b/src/GPU/pair_lj96_cut_gpu.cpp
index 5b1dd47340..f2371b14ef 100644
--- a/src/GPU/pair_lj96_cut_gpu.cpp
+++ b/src/GPU/pair_lj96_cut_gpu.cpp
@@ -106,6 +106,8 @@ void PairLJ96CutGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp
index d894d6acf1..dbaef3b929 100644
--- a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp
+++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp
@@ -101,6 +101,8 @@ void PairLJCharmmCoulCharmmGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
index 5153ea0b37..87d4896bde 100644
--- a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp
@@ -122,6 +122,8 @@ void PairLJCharmmCoulLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_class2_coul_long_gpu.cpp b/src/GPU/pair_lj_class2_coul_long_gpu.cpp
index 2de9586596..90a4682e8f 100644
--- a/src/GPU/pair_lj_class2_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_class2_coul_long_gpu.cpp
@@ -120,6 +120,8 @@ void PairLJClass2CoulLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_class2_gpu.cpp b/src/GPU/pair_lj_class2_gpu.cpp
index 7d7edb773d..9668c1d63a 100644
--- a/src/GPU/pair_lj_class2_gpu.cpp
+++ b/src/GPU/pair_lj_class2_gpu.cpp
@@ -106,6 +106,8 @@ void PairLJClass2GPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cubic_gpu.cpp b/src/GPU/pair_lj_cubic_gpu.cpp
index 4a1316a00a..bec2465b84 100644
--- a/src/GPU/pair_lj_cubic_gpu.cpp
+++ b/src/GPU/pair_lj_cubic_gpu.cpp
@@ -111,6 +111,8 @@ void PairLJCubicGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
index 7bae62ff02..45f98d3ce8 100644
--- a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp
@@ -109,6 +109,8 @@ void PairLJCutCoulCutGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp
index 9c598a7572..86732defb5 100644
--- a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp
@@ -112,6 +112,8 @@ void PairLJCutCoulDebyeGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp
index 90c8b556dc..08d90b8b57 100644
--- a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp
@@ -119,6 +119,8 @@ void PairLJCutCoulDSFGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_coul_long_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_gpu.cpp
index 5094bdc7c9..c70fe555d0 100644
--- a/src/GPU/pair_lj_cut_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_long_gpu.cpp
@@ -122,6 +122,8 @@ void PairLJCutCoulLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp
index c1aaa6323a..aa1fa45ec2 100644
--- a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp
+++ b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp
@@ -112,6 +112,8 @@ void PairLJCutCoulMSMGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp
index cac0582138..b71e526bf2 100644
--- a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp
@@ -113,6 +113,8 @@ void PairLJCutDipoleCutGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp
index 9489a43389..df1a2d78ba 100644
--- a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp
+++ b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp
@@ -125,6 +125,8 @@ void PairLJCutDipoleLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp
index 422990e1cb..46dd67dc94 100644
--- a/src/GPU/pair_lj_cut_gpu.cpp
+++ b/src/GPU/pair_lj_cut_gpu.cpp
@@ -109,6 +109,8 @@ void PairLJCutGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp
index 3830e5dd06..d7eaf4b006 100644
--- a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp
+++ b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp
@@ -131,6 +131,8 @@ void PairLJCutTIP4PLongGPU::compute(int eflag, int vflag)
                              success, atom->q, atom->nlocal, domain->boxlo, domain->prd);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/GPU/pair_lj_expand_coul_long_gpu.cpp b/src/GPU/pair_lj_expand_coul_long_gpu.cpp
index c9ffd0ac23..35cb18122a 100644
--- a/src/GPU/pair_lj_expand_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_expand_coul_long_gpu.cpp
@@ -123,6 +123,8 @@ void PairLJExpandCoulLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp
index 8d7dcf2c21..1e1eac603b 100644
--- a/src/GPU/pair_lj_expand_gpu.cpp
+++ b/src/GPU/pair_lj_expand_gpu.cpp
@@ -107,6 +107,8 @@ void PairLJExpandGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_gromacs_gpu.cpp b/src/GPU/pair_lj_gromacs_gpu.cpp
index 424bce480c..8bb901f961 100644
--- a/src/GPU/pair_lj_gromacs_gpu.cpp
+++ b/src/GPU/pair_lj_gromacs_gpu.cpp
@@ -108,6 +108,8 @@ void PairLJGromacsGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp
index 9bd5dc4749..4d8fbb5139 100644
--- a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp
+++ b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp
@@ -112,6 +112,8 @@ void PairLJSFDipoleSFGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_smooth_gpu.cpp b/src/GPU/pair_lj_smooth_gpu.cpp
index 8ae295282e..5451f4a4f4 100644
--- a/src/GPU/pair_lj_smooth_gpu.cpp
+++ b/src/GPU/pair_lj_smooth_gpu.cpp
@@ -113,6 +113,8 @@ void PairLJSmoothGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_lj_spica_coul_long_gpu.cpp b/src/GPU/pair_lj_spica_coul_long_gpu.cpp
index b315b8cc57..4317c04220 100644
--- a/src/GPU/pair_lj_spica_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_spica_coul_long_gpu.cpp
@@ -125,6 +125,8 @@ void PairLJSPICACoulLongGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     if (evflag) {
diff --git a/src/GPU/pair_lj_spica_gpu.cpp b/src/GPU/pair_lj_spica_gpu.cpp
index 71756a8c26..d531e27284 100644
--- a/src/GPU/pair_lj_spica_gpu.cpp
+++ b/src/GPU/pair_lj_spica_gpu.cpp
@@ -110,6 +110,8 @@ void PairLJSPICAGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     if (evflag) {
diff --git a/src/GPU/pair_mie_cut_gpu.cpp b/src/GPU/pair_mie_cut_gpu.cpp
index 075546588a..0dabf9f3e2 100644
--- a/src/GPU/pair_mie_cut_gpu.cpp
+++ b/src/GPU/pair_mie_cut_gpu.cpp
@@ -107,6 +107,8 @@ void PairMIECutGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp
index b0ac2cce14..570027c1d8 100644
--- a/src/GPU/pair_morse_gpu.cpp
+++ b/src/GPU/pair_morse_gpu.cpp
@@ -105,6 +105,8 @@ void PairMorseGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_resquared_gpu.cpp b/src/GPU/pair_resquared_gpu.cpp
index c0e700c5e6..8c1d1cec17 100644
--- a/src/GPU/pair_resquared_gpu.cpp
+++ b/src/GPU/pair_resquared_gpu.cpp
@@ -35,21 +35,28 @@ using namespace LAMMPS_NS;
 
 // External functions from cuda library for atom decomposition
 
-int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, double **sigma,
-                double **epsilon, int **form, double **host_lj1, double **host_lj2,
-                double **host_lj3, double **host_lj4, double **offset, double *special_lj,
-                const int nlocal, const int nall, const int max_nbors, const int maxspecial,
-                const double cell_size, int &gpu_mode, FILE *screen);
+int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq,
+                double **sigma, double **epsilon, int **form,
+                double **host_lj1, double **host_lj2, double **host_lj3,
+                double **host_lj4, double **offset, double *special_lj,
+                const int nlocal, const int nall, const int max_nbors,
+                const int maxspecial, const double cell_size, int &gpu_mode,
+                FILE *screen);
 void re_gpu_clear();
-int **re_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x,
-                       int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial,
-                       tagint **special, const bool eflag, const bool vflag, const bool eatom,
-                       const bool vatom, int &host_start, int **ilist, int **jnum,
-                       const double cpu_time, bool &success, double **host_quat);
-int *re_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type,
-                    int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag,
-                    const bool eatom, const bool vatom, int &host_start, const double cpu_time,
-                    bool &success, double **host_quat);
+int **re_gpu_compute_n(const int ago, const int inum, const int nall,
+                       double **host_x, int *host_type, double *sublo,
+                       double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum, const double cpu_time,
+                       bool &success, const int *ellipsoid,
+                       const void *bonus);
+int *re_gpu_compute(const int ago, const int inum, const int nall,
+                    double **host_x, int *host_type, int *ilist, int *numj,
+                    int **firstneigh, const bool eflag, const bool vflag,
+                    const bool eatom, const bool vatom, int &host_start,
+                    const double cpu_time, bool &success, const int *ellipsoid,
+                    const void *bonus);
 double re_gpu_bytes();
 
 enum { SPHERE_SPHERE, SPHERE_ELLIPSE, ELLIPSE_SPHERE, ELLIPSE_ELLIPSE };
@@ -61,8 +68,6 @@ PairRESquaredGPU::PairRESquaredGPU(LAMMPS *lmp) : PairRESquared(lmp), gpu_mode(G
   reinitflag = 0;
   avec = dynamic_cast<AtomVecEllipsoid *>(atom->style_match("ellipsoid"));
   if (!avec) error->all(FLERR, "Pair resquared/gpu requires atom style ellipsoid");
-  quat_nmax = 0;
-  quat = nullptr;
   suffix_flag |= Suffix::GPU;
   GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }
@@ -75,7 +80,6 @@ PairRESquaredGPU::~PairRESquaredGPU()
 {
   re_gpu_clear();
   cpu_time = 0.0;
-  memory->destroy(quat);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -90,21 +94,8 @@ void PairRESquaredGPU::compute(int eflag, int vflag)
   bool success = true;
   int *ilist, *numneigh, **firstneigh;
 
-  if (nall > quat_nmax) {
-    quat_nmax = static_cast<int>(1.1 * nall);
-    memory->grow(quat, quat_nmax, 4, "pair:quat");
-  }
   AtomVecEllipsoid::Bonus *bonus = avec->bonus;
   int *ellipsoid = atom->ellipsoid;
-  for (int i = 0; i < nall; i++) {
-    int qi = ellipsoid[i];
-    if (qi > -1) {
-      quat[i][0] = bonus[qi].quat[0];
-      quat[i][1] = bonus[qi].quat[1];
-      quat[i][2] = bonus[qi].quat[2];
-      quat[i][3] = bonus[qi].quat[3];
-    }
-  }
 
   if (gpu_mode != GPU_FORCE) {
     double sublo[3], subhi[3];
@@ -120,19 +111,24 @@ void PairRESquaredGPU::compute(int eflag, int vflag)
     }
     inum = atom->nlocal;
     firstneigh =
-        re_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag,
-                         atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom,
-                         host_start, &ilist, &numneigh, cpu_time, success, quat);
+        re_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo,
+                         subhi, atom->tag, atom->nspecial, atom->special,
+                         eflag, vflag, eflag_atom, vflag_atom, host_start,
+                         &ilist, &numneigh, cpu_time, success, ellipsoid,
+                         bonus);
   } else {
     inum = list->inum;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
-    ilist = re_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, list->ilist, numneigh,
-                           firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time,
-                           success, quat);
+    ilist = re_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+                           list->ilist, numneigh, firstneigh, eflag, vflag,
+                           eflag_atom, vflag_atom, host_start, cpu_time,
+                           success, ellipsoid, bonus);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
@@ -184,14 +180,13 @@ void PairRESquaredGPU::init_style()
   if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
   int mnf = 5e-2 * neighbor->oneatom;
   int success =
-      re_gpu_init(atom->ntypes + 1, shape1, well, cutsq, sigma, epsilon, form, lj1, lj2, lj3, lj4,
-                  offset, force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, mnf,
-                  maxspecial, cell_size, gpu_mode, screen);
+      re_gpu_init(atom->ntypes + 1, shape1, well, cutsq, sigma, epsilon, form,
+                  lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal,
+                  atom->nlocal + atom->nghost, mnf, maxspecial, cell_size,
+                  gpu_mode, screen);
   GPU_EXTRA::check_flag(success, error, world);
 
   if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL);
-  quat_nmax = static_cast<int>(1.1 * (atom->nlocal + atom->nghost));
-  memory->grow(quat, quat_nmax, 4, "pair:quat");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -199,7 +194,7 @@ void PairRESquaredGPU::init_style()
 double PairRESquaredGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
-  return bytes + memory->usage(quat, quat_nmax) + re_gpu_bytes();
+  return bytes + re_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_resquared_gpu.h b/src/GPU/pair_resquared_gpu.h
index 6d79952c39..825655a61d 100644
--- a/src/GPU/pair_resquared_gpu.h
+++ b/src/GPU/pair_resquared_gpu.h
@@ -38,8 +38,6 @@ class PairRESquaredGPU : public PairRESquared {
  private:
   int gpu_mode;
   double cpu_time;
-  int quat_nmax;
-  double **quat;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/GPU/pair_soft_gpu.cpp b/src/GPU/pair_soft_gpu.cpp
index 973e82c13a..9d406d1eaa 100644
--- a/src/GPU/pair_soft_gpu.cpp
+++ b/src/GPU/pair_soft_gpu.cpp
@@ -108,6 +108,8 @@ void PairSoftGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_sw_gpu.cpp b/src/GPU/pair_sw_gpu.cpp
index 67c52e0602..7645218a85 100644
--- a/src/GPU/pair_sw_gpu.cpp
+++ b/src/GPU/pair_sw_gpu.cpp
@@ -114,6 +114,8 @@ void PairSWGPU::compute(int eflag, int vflag)
                    success);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_table_gpu.cpp b/src/GPU/pair_table_gpu.cpp
index 6615710b8a..ec927a7845 100644
--- a/src/GPU/pair_table_gpu.cpp
+++ b/src/GPU/pair_table_gpu.cpp
@@ -107,6 +107,8 @@ void PairTableGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_tersoff_gpu.cpp b/src/GPU/pair_tersoff_gpu.cpp
index 9f0c8fa883..8610a3880c 100644
--- a/src/GPU/pair_tersoff_gpu.cpp
+++ b/src/GPU/pair_tersoff_gpu.cpp
@@ -118,6 +118,8 @@ void PairTersoffGPU::compute(int eflag, int vflag)
                         cpu_time, success);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_tersoff_mod_gpu.cpp b/src/GPU/pair_tersoff_mod_gpu.cpp
index 15bfc9a85e..1bb09c1403 100644
--- a/src/GPU/pair_tersoff_mod_gpu.cpp
+++ b/src/GPU/pair_tersoff_mod_gpu.cpp
@@ -117,6 +117,8 @@ void PairTersoffMODGPU::compute(int eflag, int vflag)
                             host_start, cpu_time, success);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_tersoff_zbl_gpu.cpp b/src/GPU/pair_tersoff_zbl_gpu.cpp
index 68b0d9dfa7..8d5e05ce4c 100644
--- a/src/GPU/pair_tersoff_zbl_gpu.cpp
+++ b/src/GPU/pair_tersoff_zbl_gpu.cpp
@@ -121,6 +121,8 @@ void PairTersoffZBLGPU::compute(int eflag, int vflag)
                             host_start, cpu_time, success);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_ufm_gpu.cpp b/src/GPU/pair_ufm_gpu.cpp
index 099bfe1e63..d1c099f9fb 100644
--- a/src/GPU/pair_ufm_gpu.cpp
+++ b/src/GPU/pair_ufm_gpu.cpp
@@ -111,6 +111,8 @@ void PairUFMGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_vashishta_gpu.cpp b/src/GPU/pair_vashishta_gpu.cpp
index 0fb0491ad3..38ad2b3c57 100644
--- a/src/GPU/pair_vashishta_gpu.cpp
+++ b/src/GPU/pair_vashishta_gpu.cpp
@@ -116,6 +116,8 @@ void PairVashishtaGPU::compute(int eflag, int vflag)
                           cpu_time, success);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_yukawa_colloid_gpu.cpp b/src/GPU/pair_yukawa_colloid_gpu.cpp
index 8701a9ee80..c1e785380d 100644
--- a/src/GPU/pair_yukawa_colloid_gpu.cpp
+++ b/src/GPU/pair_yukawa_colloid_gpu.cpp
@@ -108,6 +108,8 @@ void PairYukawaColloidGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_yukawa_gpu.cpp b/src/GPU/pair_yukawa_gpu.cpp
index e2caef9515..b27361e32d 100644
--- a/src/GPU/pair_yukawa_gpu.cpp
+++ b/src/GPU/pair_yukawa_gpu.cpp
@@ -106,6 +106,8 @@ void PairYukawaGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/GPU/pair_zbl_gpu.cpp b/src/GPU/pair_zbl_gpu.cpp
index cbb2c198f7..a1fb3e4663 100644
--- a/src/GPU/pair_zbl_gpu.cpp
+++ b/src/GPU/pair_zbl_gpu.cpp
@@ -108,6 +108,8 @@ void PairZBLGPU::compute(int eflag, int vflag)
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
+  if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
+    neighbor->build_topology();
   if (host_start < inum) {
     cpu_time = platform::walltime();
     cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
diff --git a/src/MAKE/OPTIONS/Makefile.oneapi b/src/MAKE/OPTIONS/Makefile.oneapi
index d34f0900c6..7f450d5340 100644
--- a/src/MAKE/OPTIONS/Makefile.oneapi
+++ b/src/MAKE/OPTIONS/Makefile.oneapi
@@ -6,16 +6,16 @@ SHELL = /bin/sh
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
-CC =		mpiicpc -std=c++11
-OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-qopenmp -qopenmp-simd -qno-offload -ansi-alias -restrict \
+CC =		mpiicpc -cxx=icpx -std=c++11
+OPTFLAGS =      -xHost -O2 -ffast-math -freciprocal-math
+CCFLAGS =	-qopenmp-simd -qopenmp -ansi-alias \
                 -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) \
                 -I$(MKLROOT)/include
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
-LINK =		mpiicpc -std=c++11
-LINKFLAGS =	-qopenmp -qopenmp-simd $(OPTFLAGS) -L$(MKLROOT)/lib/intel64/
+LINK =		mpiicpc -cxx=icpx -std=c++11
+LINKFLAGS =	-qopenmp-simd -qopenmp $(OPTFLAGS) -L$(MKLROOT)/lib/intel64/
 LIB =           -ltbbmalloc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core	
 SIZE =		size
 
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index 05371c8259..90e0a81fd0 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -291,6 +291,10 @@ Neighbor::~Neighbor()
 
 void Neighbor::init()
 {
+  #ifdef LMP_GPU
+  overlap_topo = 0;
+  #endif
+
   int i,j,n;
 
   ncalls = ndanger = 0;
@@ -2434,7 +2438,13 @@ void Neighbor::build(int topoflag)
 
   // build topology lists for bonds/angles/etc
 
+  #ifdef LMP_GPU
+  if (overlap_topo == 0) {
+    if ((atom->molecular != Atom::ATOMIC) && topoflag) build_topology();
+  }
+  #else
   if ((atom->molecular != Atom::ATOMIC) && topoflag) build_topology();
+  #endif
 }
 
 /* ----------------------------------------------------------------------
@@ -2817,6 +2827,17 @@ int Neighbor::exclude_setting()
   return exclude;
 }
 
+/* ----------------------------------------------------------------------
+   If nonzero, build() skips build_topology(); GPU styles call it to overlap computation
+------------------------------------------------------------------------- */
+
+#ifdef LMP_GPU
+void Neighbor::set_overlap_topo(int s)
+{
+  overlap_topo = s;
+}
+#endif
+
 /* ----------------------------------------------------------------------
    check if any of the old requested neighbor lists are full
 ------------------------------------------------------------------------- */
diff --git a/src/neighbor.h b/src/neighbor.h
index 241f44be06..dd638880c7 100644
--- a/src/neighbor.h
+++ b/src/neighbor.h
@@ -155,6 +155,12 @@ class Neighbor : protected Pointers {
   void exclusion_group_group_delete(int, int);    // rm a group-group exclusion
   int exclude_setting();                          // return exclude value to accelerator pkg
 
+  // option to call build_topology() from the GPU styles instead of build(), to overlap computation
+  #ifdef LMP_GPU
+  void set_overlap_topo(int);
+  int overlap_topo; // 0 for default/old non-overlap mode
+  #endif
+
   // find a neighbor list based on requestor
   NeighList *find_list(void *, const int id = 0) const;
   // find a neighbor request based on requestor