From 209999068ced41194ffdb3b1b1ee7837eb3478fc Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Fri, 21 Sep 2012 15:57:23 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8810
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/Nvidia.makefile        |   2 +-
 lib/gpu/Opencl.makefile        |  31 ++--
 lib/gpu/lal_atom.cu            |   6 +-
 lib/gpu/lal_aux_fun1.h         |  71 +++++++++
 lib/gpu/lal_base_atomic.cpp    |   2 +-
 lib/gpu/lal_base_charge.cpp    |   2 +-
 lib/gpu/lal_base_dipole.cpp    |   2 +-
 lib/gpu/lal_base_ellipsoid.cpp |   7 +-
 lib/gpu/lal_born.cu            |  43 +++---
 lib/gpu/lal_born_coul_long.cu  |  43 +++---
 lib/gpu/lal_born_coul_wolf.cu  |  93 +++++++-----
 lib/gpu/lal_buck.cu            |  39 +++--
 lib/gpu/lal_buck_coul.cu       |  47 +++---
 lib/gpu/lal_buck_coul_long.cu  |  54 ++++---
 lib/gpu/lal_cg_cmm.cu          |  39 +++--
 lib/gpu/lal_cg_cmm_long.cu     |  49 +++---
 lib/gpu/lal_charmm_long.cu     |  58 ++++---
 lib/gpu/lal_colloid.cu         |  76 ++++++----
 lib/gpu/lal_coul_dsf.cu        |  52 +++++--
 lib/gpu/lal_coul_long.cu       | 247 +++++++++++++++---------------
 lib/gpu/lal_device.cpp         |   5 +-
 lib/gpu/lal_device.cu          |   4 +-
 lib/gpu/lal_device.h           |   4 +-
 lib/gpu/lal_dipole_lj.cu       | 120 ++++++++++-----
 lib/gpu/lal_dipole_lj_sf.cu    | 122 ++++++++++-----
 lib/gpu/lal_eam.cpp            |  16 +-
 lib/gpu/lal_eam.cu             | 186 ++++++++++++++++-------
 lib/gpu/lal_eam.h              |  10 +-
 lib/gpu/lal_eam_ext.cpp        |  14 +-
 lib/gpu/lal_ellipsoid_extra.h  | 131 +++++++++-------
 lib/gpu/lal_ellipsoid_nbor.cu  |  30 ++--
 lib/gpu/lal_gauss.cu           |  28 ++--
 lib/gpu/lal_gayberne.cu        |  23 ++-
 lib/gpu/lal_gayberne_lj.cu     |  65 +++++---
 lib/gpu/lal_lj.cu              |  39 +++--
 lib/gpu/lal_lj96.cu            |  39 +++--
 lib/gpu/lal_lj_class2_long.cu  |  51 ++++---
 lib/gpu/lal_lj_coul.cu         |  49 +++---
 lib/gpu/lal_lj_coul_debye.cu   |  53 ++++---
 lib/gpu/lal_lj_coul_long.cu    |  49 +++---
 lib/gpu/lal_lj_dsf.cu          |  58 ++++---
 lib/gpu/lal_lj_expand.cu       |  33 ++--
 lib/gpu/lal_morse.cu           |  39 +++--
 lib/gpu/lal_neighbor_cpu.cu    |   7 +-
 lib/gpu/lal_neighbor_gpu.cu    |  26 ++--
 lib/gpu/lal_pppm.cu            |  27 ++--
 lib/gpu/lal_preprocessor.h     |  22 ++-
 lib/gpu/lal_re_squared.cu      |  24 +--
 lib/gpu/lal_re_squared_lj.cu   | 270 +++++++++++++++++++++------------
 lib/gpu/lal_table.cu           | 230 +++++++++++++++-------------
 lib/gpu/lal_yukawa.cu          |  37 +++--
 lib/gpu/lal_yukawa_colloid.cpp |   2 +-
 lib/gpu/lal_yukawa_colloid.cu  |  38 +++--
 53 files changed, 1733 insertions(+), 1081 deletions(-)

diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index 31c687369a..495ff7d054 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -87,7 +87,7 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
        $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
        $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
        $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
-       $(OBJ_DIR)/buck_coul_wolf.cubin $(OBJ_DIR)/buck_coul_wolf_cubin.h \
+       $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul_cubin.h \
        $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \
        $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \
        $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 51bd78fbd7..4ace9bd3c1 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -64,16 +64,11 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
        $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
        $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \
-       $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
-       $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
-       $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
-       $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
-       $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
-       $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
-       $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
-       $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
-       $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
-       $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h
+       $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/born_coul_wolf_cl.h \
+       $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \
+       $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \
+       $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \
+       $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h
 
 
 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@@ -131,11 +126,11 @@ $(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
 $(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h
 	$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h
 
-$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_preprocessor.h
-	$(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
+$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
 
-$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h
-	$(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
+$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
 
 $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o
 	$(OCL) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)
@@ -143,11 +138,11 @@ $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/g
 $(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
 	$(OCL) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_preprocessor.h
-	$(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
+$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
 
-$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h
-	$(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
+$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
 
 $(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o
 	$(OCL) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)
diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu
index 3446c1d4e0..2a78719ffb 100644
--- a/lib/gpu/lal_atom.cu
+++ b/lib/gpu/lal_atom.cu
@@ -17,8 +17,10 @@
 #include "lal_preprocessor.h"
 #endif
 
-__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
-                            __global int *type, const int nall) {
+__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, 
+                            const __global double *restrict x,
+                            const __global int *restrict type, 
+                            const int nall) {
   int ii=GLOBAL_ID_X;
 
   if (ii<nall) {
diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h
index 7eb7a3426f..92a03eb7d4 100644
--- a/lib/gpu/lal_aux_fun1.h
+++ b/lib/gpu/lal_aux_fun1.h
@@ -41,6 +41,8 @@
     nbor+=offset;                                                            \
   }
 
+#if (ARCH < 300)
+
 #define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
                       eflag, vflag, ans, engv)                              \
   if (t_per_atom>1) {                                                       \
@@ -137,3 +139,72 @@
     ans[ii]=f;                                                              \
   }
 
+#else
+
+#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
+                      eflag, vflag, ans, engv)                              \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv=energy;                                                         \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      f.x += shfl_xor(f.x, s, t_per_atom);                                  \
+      f.y += shfl_xor(f.y, s, t_per_atom);                                  \
+      f.z += shfl_xor(f.z, s, t_per_atom);                                  \
+      energy += shfl_xor(energy, s, t_per_atom);                            \
+      e_coul += shfl_xor(e_coul, s, t_per_atom);                            \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv=energy;                                                         \
+      engv+=inum;                                                           \
+      *engv=e_coul;                                                         \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#endif
+
diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp
index f88c4417af..6597dbfe98 100644
--- a/lib/gpu/lal_base_atomic.cpp
+++ b/lib/gpu/lal_base_atomic.cpp
@@ -132,7 +132,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist,
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
-    return false;
+    return NULL;
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp
index f9bb2a52f3..f61950cfee 100644
--- a/lib/gpu/lal_base_charge.cpp
+++ b/lib/gpu/lal_base_charge.cpp
@@ -135,7 +135,7 @@ int * BaseChargeT::reset_nbors(const int nall, const int inum, int *ilist,
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
-    return false;
+    return NULL;
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp
index 7c090e6351..8c793f554e 100644
--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@@ -137,7 +137,7 @@ int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist,
   resize_atom(inum,nall,success);
   resize_local(inum,mn,success);
   if (!success)
-    return false;
+    return NULL;
 
   nbor->get_host(inum,ilist,numj,firstneigh,block_size());
 
diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp
index 7e86d03e50..2b45b109db 100644
--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@@ -80,7 +80,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
   ucl_device=device->gpu;
   atom=&device->atom;
 
-  _block_size=device->pair_block_size();
+  _block_size=device->block_ellipse();
   compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);
 
   // Initialize host-device load balancer
@@ -118,9 +118,8 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
     ans->force.zero();
 
   // Memory for ilist ordered by particle type
-  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
-    return 0;
-  else return -3;
+  if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS)
+    return -3;
 
   _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
 
diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu
index 3410a5effd..f04b79e142 100644
--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@@ -24,14 +24,18 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, 
-                          __global numtyp2 *cutsq_sigma,
-                          const int lj_types, __global numtyp *sp_lj_in, 
-                          __global int *dev_nbor, __global int *dev_packed, 
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_born(const __global numtyp4 *restrict x_, 
+                     const __global numtyp4 *restrict coeff1,
+                     const __global numtyp4 *restrict coeff2, 
+                     const __global numtyp2 *restrict cutsq_sigma,
+                     const int lj_types, 
+                     const __global numtyp *restrict sp_lj_in, 
+                     const __global int *dev_nbor,
+                     const __global int *dev_packed, 
+                     __global acctyp4 *restrict ans,
+                     __global acctyp *restrict engv, 
+                     const int eflag, const int vflag, const int inum,
+                     const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -49,7 +53,7 @@ __kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -108,14 +112,17 @@ __kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp2 *cutsq_sigma,
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_born_fast(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1_in,
+                          const __global numtyp4 *restrict coeff2_in, 
+                          const __global numtyp2 *restrict cutsq_sigma,
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans, 
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum, 
+                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -140,7 +147,7 @@ __kernel void k_born_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu
index 601f7341cf..a602729baa 100644
--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@@ -29,14 +29,19 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          __global numtyp4 *cutsq_sigma, 
+__kernel void k_born_long(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1,
+                          const __global numtyp4 *restrict coeff2,
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict cutsq_sigma, 
                           const numtyp cut_coulsq, const numtyp qqrd2e,
                           const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
@@ -61,7 +66,7 @@ __kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -143,14 +148,18 @@ __kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_born_long_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict coeff1_in,
+                               const __global numtyp4 *restrict coeff2_in, 
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp4 *cutsq_sigma,
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict cutsq_sigma,
                                const numtyp cut_coulsq, const numtyp qqrd2e,
                                const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
@@ -178,7 +187,7 @@ __kernel void k_born_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu
index 7461569db5..07b2cdd5c8 100644
--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@@ -31,14 +31,19 @@ texture<int2> q_tex;
 
 #define MY_PIS (acctyp)1.77245385090551602729
 
-__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          __global numtyp4 *cutsq_sigma, 
+__kernel void k_born_wolf(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1,
+                          const __global numtyp4 *restrict coeff2, 
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict cutsq_sigma, 
                           const numtyp cut_coulsq, const numtyp qqrd2e,
                           const numtyp alf, const numtyp e_shift, 
                           const numtyp f_shift, const int t_per_atom) {
@@ -64,7 +69,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -74,7 +79,8 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
     int itype=ix.w;
 
     if (eflag>0) {
-      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
       e_coul += (acctyp)2.0*e_self;
     }
 
@@ -83,7 +89,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
-      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      factor_coul = sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -98,7 +104,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
       int mtype=itype*lj_types+jtype;
       if (rsq<cutsq_sigma[mtype].x) { // cutsq
         numtyp r2inv = ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
         numtyp v_sh = (numtyp)0.0;
         numtyp rexp = (numtyp)0.0;
         
@@ -113,14 +119,14 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
         if (rsq < cut_coulsq) {
           numtyp r=ucl_rsqrt(r2inv);
           numtyp arij = alf * r;
-          numtyp expm2 = ucl_exp(-arij*arij);
-          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
-          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          numtyp erfcd = ucl_exp(-arij*arij);
           fetch(prefactor,j,q_tex);
           prefactor *= qqrd2e * qtmp/r;
-          v_sh = (_erfc - e_shift*r)*prefactor; 
-          numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
-          forcecoul = prefactor * (dvdrr*rsq-factor_coul);
+
+          erfcc = erfc(arij);
+          v_sh = (erfcc - e_shift*r)*prefactor;
+          numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
+          forcecoul = prefactor * dvdrr*rsq*factor_coul;
         } else forcecoul = (numtyp)0.0;
 
         force = (forceborn + forcecoul) * r2inv;
@@ -131,8 +137,8 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
 
         if (eflag>0) {
           if (rsq < cut_coulsq)
-            e_coul += prefactor*(v_sh-factor_coul);
-          if (rsq < coeff1[mtype].w) {
+            e_coul += v_sh*factor_coul;
+          if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
               + coeff2[mtype].z*r2inv*r6inv;
             energy+=factor_lj*(e-coeff2[mtype].w);
@@ -154,14 +160,18 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict coeff1_in,
+                               const __global numtyp4 *restrict coeff2_in, 
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp4 *cutsq_sigma,
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict cutsq_sigma,
                                const numtyp cut_coulsq, const numtyp qqrd2e,
                                const numtyp alf, const numtyp e_shift, 
                                const numtyp f_shift, const int t_per_atom) {
@@ -190,7 +200,7 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -201,7 +211,8 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
     if (eflag>0) {
-      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
       e_coul += (acctyp)2.0*e_self;
     }
 
@@ -210,7 +221,7 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
 
       numtyp factor_lj, factor_coul;
       factor_lj = sp_lj[sbmask(j)];
-      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      factor_coul = sp_lj[sbmask(j)+4];
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@@ -224,11 +235,11 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
 
       if (rsq<cutsq_sigma[mtype].x) {
         numtyp r2inv=ucl_recip(rsq);
-        numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc;
+        numtyp forcecoul, forceborn, force, r6inv, prefactor, erfcc;
         numtyp v_sh = (numtyp)0.0;
         numtyp rexp = (numtyp)0.0;
         
-        if (rsq < coeff1[mtype].w) {
+        if (rsq < cutsq_sigma[mtype].y) {
           numtyp r = ucl_sqrt(rsq);
           rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
           r6inv = r2inv*r2inv*r2inv;
@@ -237,16 +248,16 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
         } else forceborn = (numtyp)0.0;
 
         if (rsq < cut_coulsq) {
-          numtyp r=ucl_rsqrt(r2inv);
+          numtyp r=ucl_sqrt(rsq);
           numtyp arij = alf * r;
-          numtyp expm2 = ucl_exp(-arij*arij);
-          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*arij);
-          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          numtyp erfcd = ucl_exp(-arij*arij);
           fetch(prefactor,j,q_tex);
           prefactor *= qqrd2e * qtmp/r;
-          v_sh = (_erfc - e_shift*r)*prefactor; 
-          numtyp dvdrr = (_erfc/rsq + EWALD_F*alf*expm2/r) + f_shift;
-          forcecoul = prefactor * (dvdrr*rsq-factor_coul);
+
+          erfcc = erfc(arij);
+          v_sh = (erfcc - e_shift*r)*prefactor;
+          numtyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
+          forcecoul = prefactor * dvdrr*rsq*factor_coul;
         } else forcecoul = (numtyp)0.0;
 
         force = (forceborn + forcecoul) * r2inv;
@@ -257,8 +268,8 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
 
         if (eflag>0) {
           if (rsq < cut_coulsq)
-            e_coul += prefactor*(v_sh-factor_coul);
-          if (rsq < coeff1[mtype].w) {
+            e_coul += v_sh*factor_coul;
+          if (rsq < cutsq_sigma[mtype].y) {
             numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
               + coeff2[mtype].z*r2inv*r6inv;
             energy+=factor_lj*(e-coeff2[mtype].w);
diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu
index b0c817ad35..8da8587f58 100644
--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_buck(const __global numtyp4 *restrict x_, 
+                     const __global numtyp4 *restrict coeff1,
+                     const __global numtyp4 *restrict coeff2, 
+                     const int lj_types, 
+                     const __global numtyp *restrict sp_lj_in,
+                     const __global int *dev_nbor, 
+                     const __global int *dev_packed,
+                     __global acctyp4 *restrict ans,
+                     __global acctyp *restrict engv, 
+                     const int eflag,  const int vflag, const int inum,
+                     const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -48,7 +52,7 @@ __kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -106,13 +110,16 @@ __kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_buck_fast(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1_in,
+                          const __global numtyp4 *restrict coeff2_in, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum, 
+                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -137,7 +144,7 @@ __kernel void k_buck_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu
index 45cc36b0ce..e4febf5578 100644
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                          __global numtyp4 *cutsq, const numtyp qqrd2e,
-                          const int t_per_atom) {
+__kernel void k_buck_coul(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict coeff1,
+                          const __global numtyp4 *restrict coeff2, 
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_ ,
+                          const __global numtyp4 *restrict cutsq, 
+                          const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -60,7 +65,7 @@ __kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -136,15 +141,19 @@ __kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict coeff1_in,
+                               const __global numtyp4 *restrict coeff2_in, 
+                               const __global numtyp *restrict sp_lj_in, 
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed, 
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp4 *_cutsq, const numtyp qqrd2e, 
-                               const int t_per_atom) {
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict _cutsq, 
+                               const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -172,7 +181,7 @@ __kernel void k_buck_coul_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu
index 9ab3ed32c5..ea59de17dd 100644
--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@@ -29,16 +29,21 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
-                          __global numtyp4* coeff2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          __global numtyp *cutsq, 
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict coeff1,
+                               const __global numtyp4 *restrict coeff2, 
+                               const int lj_types, 
+                               const __global numtyp *restrict sp_lj_in, 
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed, 
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv, 
+                               const int eflag, const int vflag, const int inum,
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp *restrict cutsq, 
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -61,7 +66,7 @@ __kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -144,16 +149,21 @@ __kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1,
   } // if ii
 }
 
-__kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in,
-                               __global numtyp4* coeff2_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp *cutsq,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, 
+                                    const __global numtyp4 *restrict coeff1_in,
+                                    const __global numtyp4 *restrict coeff2_in, 
+                                    const __global numtyp *restrict sp_lj_in,
+                                    const __global int *dev_nbor, 
+                                    const __global int *dev_packed,
+                                    __global acctyp4 *restrict ans,
+                                    __global acctyp *restrict engv, 
+                                    const int eflag, const int vflag, 
+                                    const int inum, const int nbor_pitch,
+                                    const __global numtyp *restrict q_, 
+                                    const __global numtyp *restrict cutsq,
+                                    const numtyp cut_coulsq, 
+                                    const numtyp qqrd2e, const numtyp g_ewald, 
+                                    const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -179,7 +189,7 @@ __kernel void k_buck_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *coef
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_cg_cmm.cu
index 4543320cc7..bce2904664 100644
--- a/lib/gpu/lal_cg_cmm.cu
+++ b/lib/gpu/lal_cg_cmm.cu
@@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, 
+                       const __global numtyp4 *restrict lj1,
+                       const __global numtyp4 *restrict lj3, 
+                       const int lj_types, 
+                       const __global numtyp *restrict sp_lj_in,
+                       const __global int *dev_nbor, 
+                       const __global int *dev_packed,
+                       __global acctyp4 *restrict ans,
+                       __global acctyp *restrict engv, 
+                       const int eflag, const int vflag, const int inum,
+                       const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -48,7 +52,7 @@ __kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -111,13 +115,16 @@ __kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,__global int *dev_nbor,
-                               __global int *dev_packed, __global acctyp4 *ans,
-                               __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum,
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict lj1_in,
+                            const __global numtyp4 *restrict lj3_in, 
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv,
+                            const int eflag, const int vflag, const int inum,
+                            const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -142,7 +149,7 @@ __kernel void k_cg_cmm_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_cg_cmm_long.cu
index 2f1d9f2d21..487eb10355 100644
--- a/lib/gpu/lal_cg_cmm_long.cu
+++ b/lib/gpu/lal_cg_cmm_long.cu
@@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict lj1,
+                            const __global numtyp4 *restrict lj3, 
+                            const int lj_types, 
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor, 
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv,
+                            const int eflag,  const int vflag, const int inum,
+                            const int nbor_pitch,
+                            const __global numtyp *restrict q_ ,
+                            const numtyp cut_coulsq, const numtyp qqrd2e,
+                            const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -60,7 +65,7 @@ __kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -148,15 +153,19 @@ __kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_, 
+                                 const __global numtyp4 *restrict lj1_in,
+                                 const __global numtyp4 *restrict lj3_in, 
+                                 const __global numtyp *restrict sp_lj_in, 
+                                 const __global int *dev_nbor, 
+                                 const __global int *dev_packed,
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv, 
+                                 const int eflag, const int vflag, 
+                                 const int inum, const int nbor_pitch,
+                                 const __global numtyp *restrict q_, 
+                                 const numtyp cut_coulsq, const numtyp qqrd2e,
+                                 const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -181,7 +190,7 @@ __kernel void k_cg_cmm_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu
index 9b884e473a..0f7b925f69 100644
--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@@ -29,16 +29,21 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          const int lj_types, __global numtyp *sp_lj_in,
-                          __global int *dev_nbor, __global int *dev_packed,
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, const int inum, 
-                          const int nbor_pitch, __global numtyp *q_,
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const numtyp denom_lj,
-                          const numtyp cut_bothsq, const numtyp cut_ljsq,
-                          const numtyp cut_lj_innersq, const int t_per_atom) {
+__kernel void k_charmm_long(const __global numtyp4 *restrict x_,
+                            const __global numtyp4 *restrict lj1,
+                            const int lj_types, 
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv, 
+                            const int eflag, const int vflag, const int inum, 
+                            const int nbor_pitch, 
+                            const __global numtyp *restrict q_,
+                            const numtyp cut_coulsq, const numtyp qqrd2e,
+                            const numtyp g_ewald, const numtyp denom_lj,
+                            const numtyp cut_bothsq, const numtyp cut_ljsq,
+                            const numtyp cut_lj_innersq, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -61,7 +66,7 @@ __kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -152,17 +157,21 @@ __kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
-                               __global numtyp* sp_lj_in, __global int *dev_nbor, 
-                               __global int *dev_packed, __global acctyp4 *ans,
-                               __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum,
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const numtyp denom_lj,
-                               const numtyp cut_bothsq, const numtyp cut_ljsq, 
-                               const numtyp cut_lj_innersq,
-                               const int t_per_atom) {
+__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, 
+                                 const __global numtyp2 *restrict ljd_in,
+                                 const __global numtyp *restrict sp_lj_in, 
+                                 const __global int *dev_nbor, 
+                                 const __global int *dev_packed, 
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv, 
+                                 const int eflag, const int vflag, 
+                                 const int inum, const int nbor_pitch, 
+                                 const __global numtyp *restrict q_,
+                                 const numtyp cut_coulsq, const numtyp qqrd2e,
+                                 const numtyp g_ewald, const numtyp denom_lj,
+                                 const numtyp cut_bothsq, const numtyp cut_ljsq, 
+                                 const numtyp cut_lj_innersq,
+                                 const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -170,7 +179,8 @@ __kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
   __local numtyp sp_lj[8];
   if (tid<8)
     sp_lj[tid]=sp_lj_in[tid];
-  ljd[tid]=ljd_in[tid];
+  if (tid<MAX_BIO_SHARED_TYPES)
+    ljd[tid]=ljd_in[tid];
   if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
     ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
   
@@ -185,7 +195,7 @@ __kernel void k_charmm_long_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu
index 74e249083d..e918feca8b 100644
--- a/lib/gpu/lal_colloid.cu
+++ b/lib/gpu/lal_colloid.cu
@@ -24,16 +24,19 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
-                        __global numtyp4* lj3, const int lj_types, 
-                        __global numtyp *sp_lj_in, 
-                        __global numtyp4* colloid1, 
-                        __global numtyp4* colloid2,
-                        __global int *form, 
-                        __global int *dev_nbor, 
-                        __global int *dev_packed, __global acctyp4 *ans,
-                        __global acctyp *engv, const int eflag, 
-                        const int vflag, const int inum,
+__kernel void k_colloid(const __global numtyp4 *restrict x_, 
+                        const __global numtyp4 *restrict lj1,
+                        const __global numtyp4 *restrict lj3, 
+                        const int lj_types, 
+                        const __global numtyp *restrict sp_lj_in, 
+                        const __global numtyp4 *restrict colloid1, 
+                        const __global numtyp4 *restrict colloid2,
+                        const __global int *form, 
+                        const __global int *dev_nbor, 
+                        const __global int *dev_packed, 
+                        __global acctyp4 *restrict ans,
+                        __global acctyp *restrict engv, 
+                        const int eflag, const int vflag, const int inum,
                         const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -52,7 +55,7 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -98,9 +101,10 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
           K[3] *= K[3]*K[3];
           K[6] = K[3]*K[3];
           fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
-          force = 4.0/15.0*fR * 
-            (2.0*(K[1]+K[2]) * (K[1]*(5.0*K[1]+22.0*K[2])+5.0*K[4]) * 
-          colloid2[mtype].w/K[6]-5.0) / K[0];
+          force = (numtyp)4.0/(numtyp)15.0*fR * 
+             ((numtyp)2.0*(K[1]+K[2]) * 
+             (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) * 
+             colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
           force*=factor_lj;
         } else if (form[mtype]==2) { // LARGE_LARGE
           r = ucl_sqrt(rsq);
@@ -131,8 +135,9 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
           fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
           evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
           numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
-          numtyp dUA = -colloid1[mtype].x/3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + 
-            ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
+          numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
+                       (((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + 
+                       ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
           force = factor_lj * (dUR+dUA)/r;
         }
   
@@ -149,7 +154,8 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
               ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2]) +  
               (numtyp)4.2*K[4])+K[2]*K[4]) * colloid2[mtype].w/K[6]);
           } else if (form[mtype]==2) {
-            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
+            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * 
+              ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
           } 
           energy+=factor_lj*(e-lj3[mtype].z); 
         }
@@ -169,15 +175,17 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_colloid_fast(__global numtyp4 *x_, 
-                             __global numtyp4 *lj1_in,
-                             __global numtyp4 *lj3_in, 
-                             __global numtyp *sp_lj_in,
-                             __global numtyp4 *colloid1_in, 
-                             __global numtyp4 *colloid2_in,
-                             __global int *form_in, 
-                             __global int *dev_nbor, __global int *dev_packed, 
-                             __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_colloid_fast(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict lj1_in,
+                             const __global numtyp4 *restrict lj3_in, 
+                             const __global numtyp *restrict sp_lj_in,
+                             const __global numtyp4 *restrict colloid1_in, 
+                             const __global numtyp4 *restrict colloid2_in,
+                             const __global int *form_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *restrict ans, 
+                             __global acctyp *restrict engv, 
                              const int eflag, const int vflag, const int inum, 
                              const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -210,7 +218,7 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -257,7 +265,8 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
           K[6] = K[3]*K[3];
           fR = colloid2[mtype].z*colloid1[mtype].x*c2*K[1]/K[3];
           force = (numtyp)4.0/(numtyp)15.0*fR * 
-            ((numtyp)2.0*(K[1]+K[2]) * (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) * 
+            ((numtyp)2.0*(K[1]+K[2]) * 
+            (K[1]*((numtyp)5.0*K[1]+(numtyp)22.0*K[2])+(numtyp)5.0*K[4]) * 
             colloid2[mtype].w/K[6]-(numtyp)5.0) / K[0];
           force*=factor_lj;
         } else if (form[mtype]==2) { // LARGE_LARGE
@@ -289,7 +298,8 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
           fR = colloid1[mtype].x*colloid2[mtype].w/r/(numtyp)37800.0;
           evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
           numtyp dUR = evdwl/r + (numtyp)5.0*fR*(g[0]+g[1]-g[2]-g[3]);
-          numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*(((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + 
+          numtyp dUA = -colloid1[mtype].x/(numtyp)3.0*r*
+            (((numtyp)2.0*K[0]*K[7]+(numtyp)1.0)*K[7] + 
             ((numtyp)2.0*K[0]*K[8]-(numtyp)1.0)*K[8]);
           force = factor_lj * (dUR+dUA)/r;
         } else force = (numtyp)0.0;
@@ -304,10 +314,12 @@ __kernel void k_colloid_fast(__global numtyp4 *x_,
             e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
           } else if (form[mtype]==1) {
             e=(numtyp)2.0/(numtyp)9.0*fR * 
-              ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+(numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
-             colloid2[mtype].w/K[6]);
+              ((numtyp)1.0-(K[1]*(K[1]*(K[1]/(numtyp)3.0+
+              (numtyp)3.0*K[2])+(numtyp)4.2*K[4])+K[2]*K[4])*
+              colloid2[mtype].w/K[6]);
           } else if (form[mtype]==2) {
-            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
+            e=evdwl+colloid1[mtype].x/(numtyp)6.0 * 
+              ((numtyp)2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7]));
           } 
           energy+=factor_lj*(e-lj3[mtype].z); 
         }
diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu
index 3ce1eccab3..f013fe663f 100644
--- a/lib/gpu/lal_coul_dsf.cu
+++ b/lib/gpu/lal_coul_dsf.cu
@@ -31,12 +31,16 @@ texture<int2> q_tex;
 
 #define MY_PIS (acctyp)1.77245385090551602729
 
-__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types, 
-                         __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                         __global int *dev_packed, __global acctyp4 *ans,
-                         __global acctyp *engv, const int eflag,
-                         const int vflag, const int inum,
-                         const int nbor_pitch, __global numtyp *q_ ,
+__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, 
+                         const int lj_types, 
+                         const __global numtyp *restrict sp_lj_in, 
+                         const __global int *dev_nbor, 
+                         const __global int *dev_packed, 
+                         __global acctyp4 *restrict ans,
+                         __global acctyp *restrict engv, 
+                         const int eflag, const int vflag, const int inum,
+                         const int nbor_pitch, 
+                         const __global numtyp *restrict q_ ,
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp e_shift, const numtyp f_shift, 
                          const numtyp alpha, const int t_per_atom) {
@@ -58,7 +62,7 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -66,6 +70,12 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
 
+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
+
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
@@ -91,8 +101,8 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
         numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
         erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-        forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd + 
-                                 rsq*f_shift);
+        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + 
+          rsq*f_shift);
         
         force = forcecoul * r2inv;
 
@@ -122,11 +132,15 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types,
   } // if ii
 }
 
-__kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
-                              __global int *dev_nbor, __global int *dev_packed,
-                              __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, 
+                              const __global numtyp *restrict sp_lj_in,
+                              const __global int *dev_nbor, 
+                              const __global int *dev_packed,
+                              __global acctyp4 *restrict ans, 
+                              __global acctyp *restrict engv, 
                               const int eflag, const int vflag, const int inum, 
-                              const int nbor_pitch, __global numtyp *q_,
+                              const int nbor_pitch, 
+                              const __global numtyp *restrict q_,
                               const numtyp cut_coulsq, const numtyp qqrd2e,
                               const numtyp e_shift, const numtyp f_shift, 
                               const numtyp alpha, const int t_per_atom) {
@@ -148,13 +162,19 @@ __kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
     numtyp qtmp; fetch(qtmp,i,q_tex);
+
+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
  
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
@@ -181,8 +201,8 @@ __kernel void k_coul_dsf_fast(__global numtyp4 *x_, __global numtyp* sp_lj_in,
         numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
         erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-        forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd + 
-                                   rsq*f_shift);
+        forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + 
+          rsq*f_shift);
         
         force = forcecoul * r2inv;
 
diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu
index b93010c959..c59c0a6899 100644
--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@@ -29,13 +29,112 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types,
-                          __global numtyp *sp_cl_in, __global int *dev_nbor,
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
+#if (ARCH < 300)
+
+#define store_answers_lq(f, e_coul, virial, ii, inum, tid,                  \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    __local acctyp red_acc[6][BLOCK_PAIR];                                  \
+                                                                            \
+    red_acc[0][tid]=f.x;                                                    \
+    red_acc[1][tid]=f.y;                                                    \
+    red_acc[2][tid]=f.z;                                                    \
+    red_acc[3][tid]=e_coul;                                                 \
+                                                                            \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      if (offset < s) {                                                     \
+        for (int r=0; r<4; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+                                                                            \
+    f.x=red_acc[0][tid];                                                    \
+    f.y=red_acc[1][tid];                                                    \
+    f.z=red_acc[2][tid];                                                    \
+    e_coul=red_acc[3][tid];                                                 \
+                                                                            \
+    if (vflag>0) {                                                          \
+      for (int r=0; r<6; r++)                                               \
+        red_acc[r][tid]=virial[r];                                          \
+                                                                            \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+        if (offset < s) {                                                   \
+          for (int r=0; r<6; r++)                                           \
+            red_acc[r][tid] += red_acc[r][tid+s];                           \
+        }                                                                   \
+      }                                                                     \
+                                                                            \
+      for (int r=0; r<6; r++)                                               \
+        virial[r]=red_acc[r][tid];                                          \
+    }                                                                       \
+  }                                                                         \
+                                                                            \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1=(acctyp)0;                                                       \
+      ap1+=inum;                                                            \
+      *ap1=e_coul;                                                          \
+      ap1+=inum;                                                            \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i];                                                     \
+        ap1+=inum;                                                          \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#else
+
+#define store_answers_lq(f, e_coul, virial, ii, inum, tid,                  \
+                         t_per_atom, offset, eflag, vflag, ans, engv)       \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        e_coul += shfl_xor(e_coul, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1=(acctyp)0;                                                       \
+      ap1+=inum;                                                            \
+      *ap1=e_coul;                                                          \
+      ap1+=inum;                                                            \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i];                                                     \
+        ap1+=inum;                                                          \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#endif
+
+__kernel void k_coul_long(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3, 
+                          const int lj_types,
+                          const __global numtyp *restrict sp_cl_in, 
+                          const __global int *dev_nbor,
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_,
                           const numtyp cut_coulsq, const numtyp qqrd2e,
                           const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
@@ -55,7 +154,7 @@ __kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -109,71 +208,22 @@ __kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
       }
 
     } // for nbor
-
-    // Reduce answers
-    if (t_per_atom>1) {
-      __local acctyp red_acc[6][BLOCK_PAIR];
-
-      red_acc[0][tid]=f.x;
-      red_acc[1][tid]=f.y;
-      red_acc[2][tid]=f.z;
-      red_acc[3][tid]=e_coul;
-
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        if (offset < s) {
-          for (int r=0; r<4; r++)
-            red_acc[r][tid] += red_acc[r][tid+s];
-        }
-      }
-
-      f.x=red_acc[0][tid];
-      f.y=red_acc[1][tid];
-      f.z=red_acc[2][tid];
-      e_coul=red_acc[3][tid];
-
-      if (vflag>0) {
-        for (int r=0; r<6; r++)
-          red_acc[r][tid]=virial[r];
-
-        for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-          if (offset < s) {
-            for (int r=0; r<6; r++)
-              red_acc[r][tid] += red_acc[r][tid+s];
-          }
-        }
-
-        for (int r=0; r<6; r++)
-          virial[r]=red_acc[r][tid];
-      }
-    }
-
-    // Store answers
-    if (offset==0) {
-      __global acctyp *ap1=engv+ii;
-      if (eflag>0) {
-        *ap1=(acctyp)0;
-        ap1+=inum;
-        *ap1=e_coul;
-        ap1+=inum;
-      }
-      if (vflag>0) {
-        for (int i=0; i<6; i++) {
-          *ap1=virial[i];
-          ap1+=inum;
-        }
-      }
-      ans[ii]=f;
-    }
+    store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                     vflag,ans,engv);
   } // if ii
 }
 
-__kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in,
-                               __global numtyp* sp_cl_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv,
+__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in,
+                               const __global numtyp *restrict sp_cl_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv,
                                const int eflag, const int vflag, const int inum,
-                               const int nbor_pitch, __global numtyp *q_,
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
                                const numtyp cut_coulsq, const numtyp qqrd2e,
                                const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
@@ -193,7 +243,7 @@ __kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
 
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -247,61 +297,8 @@ __kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       }
 
     } // for nbor
-
-    // Reduce answers
-    if (t_per_atom>1) {
-      __local acctyp red_acc[6][BLOCK_PAIR];
-
-      red_acc[0][tid]=f.x;
-      red_acc[1][tid]=f.y;
-      red_acc[2][tid]=f.z;
-      red_acc[3][tid]=e_coul;
-
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        if (offset < s) {
-          for (int r=0; r<4; r++)
-            red_acc[r][tid] += red_acc[r][tid+s];
-        }
-      }
-
-      f.x=red_acc[0][tid];
-      f.y=red_acc[1][tid];
-      f.z=red_acc[2][tid];
-      e_coul=red_acc[3][tid];
-
-      if (vflag>0) {
-        for (int r=0; r<6; r++)
-          red_acc[r][tid]=virial[r];
-
-        for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-          if (offset < s) {
-            for (int r=0; r<6; r++)
-              red_acc[r][tid] += red_acc[r][tid+s];
-          }
-        }
-
-        for (int r=0; r<6; r++)
-          virial[r]=red_acc[r][tid];
-      }
-    }
-
-    // Store answers
-    if (offset==0) {
-      __global acctyp *ap1=engv+ii;
-      if (eflag>0) {
-        *ap1=(acctyp)0;
-        ap1+=inum;
-        *ap1=e_coul;
-        ap1+=inum;
-      }
-      if (vflag>0) {
-        for (int i=0; i<6; i++) {
-          *ap1=virial[i];
-          ap1+=inum;
-        }
-      }
-      ans[ii]=f;
-    }
+    store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                     vflag,ans,engv);
   } // if ii
 }
 
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 3952495393..979ff2cbed 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -580,14 +580,14 @@ int DeviceT::compile_kernels() {
   k_info.set_function(*dev_program,"kernel_info");
   _compiled=true;
 
-  UCL_Vector<int,int> gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
+  UCL_Vector<int,int> gpu_lib_data(15,*gpu,UCL_NOT_PINNED);
   k_info.set_size(1,1);
   k_info.run(&gpu_lib_data);
   gpu_lib_data.update_host(false);
   
   _ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0;
   #ifndef USE_OPENCL
-  if (_ptx_arch>gpu->arch())
+  if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch()))
     return -4;
   #endif
 
@@ -606,6 +606,7 @@ int DeviceT::compile_kernels() {
   _block_nbor_build=gpu_lib_data[10];
   _block_bio_pair=gpu_lib_data[11];
   _max_bio_shared_types=gpu_lib_data[12];
+  _block_ellipse=gpu_lib_data[14];
 
   if (static_cast<size_t>(_block_pair)>gpu->group_size())
     _block_pair=gpu->group_size();
diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu
index 54a95417be..28b58f7760 100644
--- a/lib/gpu/lal_device.cu
+++ b/lib/gpu/lal_device.cu
@@ -17,7 +17,8 @@
 #include "lal_preprocessor.h"
 #endif
 
-__kernel void kernel_zero(__global int *mem, int numel) {
+__kernel void kernel_zero(__global int *restrict mem, 
+                          int numel) {
   int ii=GLOBAL_ID_X;
   
   if (ii<numel)
@@ -39,4 +40,5 @@ __kernel void kernel_info(__global int *info) {
   info[11]=BLOCK_BIO_PAIR;
   info[12]=MAX_BIO_SHARED_TYPES;
   info[13]=THREADS_PER_CHARGE;
+  info[14]=BLOCK_ELLIPSE;
 }
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 6cfad82054..62b0d862bd 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -228,6 +228,8 @@ class Device {
   inline int block_nbor_build() const { return _block_nbor_build; }
   /// Return the block size for "bio" pair styles
   inline int block_bio_pair() const { return _block_bio_pair; }
+  /// Return the block size for "ellipse" pair styles
+  inline int block_ellipse() const { return _block_ellipse; }
   /// Return the maximum number of atom types for shared mem with "bio" styles
   inline int max_bio_shared_types() const { return _max_bio_shared_types; }
   /// Architecture gpu code compiled for (returns 0 for OpenCL)
@@ -292,7 +294,7 @@ class Device {
 
   int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
   int _pppm_max_spline, _pppm_block;
-  int _block_pair, _max_shared_types;
+  int _block_pair, _block_ellipse, _max_shared_types;
   int _block_cell_2d, _block_cell_id, _block_nbor_build;
   int _block_bio_pair, _max_bio_shared_types;
 
diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu
index 6ab0b3037d..620f904eba 100644
--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@@ -15,6 +15,23 @@
 
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+texture<float4> mu_tex;
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+texture<int4,1> mu_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#define mu_tex mu_
+#endif
+
+#if (ARCH < 300)
 
 #define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid,      \
                         t_per_atom, offset, eflag, vflag, ans, engv)        \
@@ -73,32 +90,63 @@
     ans[ii+inum]=tor;                                                       \
   }
 
-#ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
-texture<float4> mu_tex;
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
-texture<int4,1> mu_tex;
+
+#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid,     \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+        e_coul += shfl_xor(e_coul, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv=energy;                                                         \
+      engv+=inum;                                                           \
+      *engv=e_coul;                                                         \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+inum]=tor;                                                       \
+  }
+
 #endif
 
-#else
-#define pos_tex x_
-#define q_tex q_
-#define mu_tex mu_
-#endif
-
-__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                           __global numtyp4 *mu_,
-                          __global numtyp *cutsq, const numtyp qqrd2e,
-                          const int t_per_atom) {
+__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3, 
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch, 
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict mu_,
+                          const __global numtyp *restrict cutsq,
+                          const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -125,7 +173,7 @@ __kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -291,16 +339,20 @@ __kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in, 
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp4 *mu_,
-                               __global numtyp *_cutsq, const numtyp qqrd2e,
-                               const int t_per_atom) {
+                               const int nbor_pitch, 
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict mu_,
+                               const __global numtyp *restrict _cutsq, 
+                               const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -332,7 +384,7 @@ __kernel void k_dipole_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu
index 16d5463cb1..83024e705c 100644
--- a/lib/gpu/lal_dipole_lj_sf.cu
+++ b/lib/gpu/lal_dipole_lj_sf.cu
@@ -16,6 +16,24 @@
 #ifdef NV_KERNEL
 #include "lal_aux_fun1.h"
 
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+texture<float4> mu_tex;
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+texture<int4,1> mu_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#define mu_tex mu_
+#endif
+
+#if (ARCH < 300)
+
 #define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid,      \
                         t_per_atom, offset, eflag, vflag, ans, engv)        \
   if (t_per_atom>1) {                                                       \
@@ -73,32 +91,63 @@
     ans[ii+inum]=tor;                                                       \
   }
 
-#ifndef _DOUBLE_DOUBLE
-texture<float4> pos_tex;
-texture<float> q_tex;
-texture<float4> mu_tex;
 #else
-texture<int4,1> pos_tex;
-texture<int2> q_tex;
-texture<int4,1> mu_tex;
+
+#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid,     \
+                         t_per_atom, offset, eflag, vflag, ans, engv)       \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+        e_coul += shfl_xor(e_coul, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv=energy;                                                         \
+      engv+=inum;                                                           \
+      *engv=e_coul;                                                         \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+inum]=tor;                                                       \
+  }
+
 #endif
 
-#else
-#define pos_tex x_
-#define q_tex q_
-#define mu_tex mu_
-#endif
-
-__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
-                             __global numtyp4* lj3, const int lj_types, 
-                             __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                             __global int *dev_packed, __global acctyp4 *ans,
-                             __global acctyp *engv, const int eflag,
-                             const int vflag, const int inum,
-                             const int nbor_pitch, __global numtyp *q_ ,
-                             __global numtyp4 *mu_,
-                             __global numtyp *cutsq, const numtyp qqrd2e,
-                             const int t_per_atom) {
+__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict lj1,
+                             const __global numtyp4 *restrict lj3, 
+                             const int lj_types, 
+                             const __global numtyp *restrict sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv, 
+                             const int eflag, const int vflag, const int inum,
+                             const int nbor_pitch, 
+                             const __global numtyp *restrict q_ ,
+                             const __global numtyp4 *restrict mu_,
+                             const __global numtyp *restrict cutsq, 
+                             const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -125,7 +174,7 @@ __kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -322,15 +371,20 @@ __kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                                  __global numtyp4* lj3_in, 
-                                  __global numtyp* sp_lj_in,
-                                  __global int *dev_nbor, __global int *dev_packed,
-                                  __global acctyp4 *ans, __global acctyp *engv, 
-                                  const int eflag, const int vflag, const int inum, 
-                                  const int nbor_pitch, __global numtyp *q_,
-                                  __global numtyp4 *mu_,
-                                  __global numtyp *_cutsq, const numtyp qqrd2e,
+__kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, 
+                                  const __global numtyp4 *restrict lj1_in,
+                                  const __global numtyp4 *restrict lj3_in, 
+                                  const __global numtyp *restrict sp_lj_in,
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed,
+                                  __global acctyp4 *restrict ans, 
+                                  __global acctyp *restrict engv, 
+                                  const int eflag, const int vflag, 
+                                  const int inum, const int nbor_pitch, 
+                                  const __global numtyp *restrict q_,
+                                  const __global numtyp4 *restrict mu_,
+                                  const __global numtyp *restrict _cutsq, 
+                                  const numtyp qqrd2e,
                                   const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -363,7 +417,7 @@ __kernel void k_dipole_lj_sf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp
index 5182f0b11f..c6e8ab9fba 100644
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@@ -46,10 +46,11 @@ template <class numtyp, class acctyp>
 int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
                int **host_type2z2r, int *host_type2frho,
                double ***host_rhor_spline, double ***host_z2r_spline,
-               double ***host_frho_spline, double rdr, double rdrho, int nrhor,
-               int nrho, int nz2r, int nfrho, int nr, const int nlocal,
-               const int nall, const int max_nbors, const int maxspecial,
-               const double cell_size, const double gpu_split, FILE *_screen) 
+               double ***host_frho_spline, double rdr, double rdrho, 
+               double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr,
+               const int nlocal, const int nall, const int max_nbors,
+               const int maxspecial, const double cell_size, 
+               const double gpu_split, FILE *_screen) 
 {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
@@ -97,6 +98,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
   _cutforcesq=host_cutforcesq;
   _rdr=rdr;
   _rdrho = rdrho;
+  _rhomax=rhomax;
   _nrhor=nrhor;
   _nrho=nrho;
   _nz2r=nz2r;
@@ -468,15 +470,15 @@ void EAMT::loop(const bool _eflag, const bool _vflag) {
                             &this->nbor->dev_nbor,  &this->_nbor_data->begin(), 
                             &_fp, &this->ans->engv, &eflag, &ainum,
                             &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho,
-                            &_nrho, &_nr, &this->_threads_per_atom);
+                            &_rhomax, &_nrho, &_nr, &this->_threads_per_atom);
   } else {
     this->k_energy.set_size(GX,BX);
     this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho,
                        &rhor_spline2, &frho_spline1, &frho_spline2, 
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, 
                        &this->ans->engv,&eflag, &ainum, &nbor_pitch,
-                       &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr,
-                       &this->_threads_per_atom);
+                       &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho,
+                       &_nr, &this->_threads_per_atom);
   }
 
   this->time_pair.stop();
diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu
index ec20bd672f..aedbf7458f 100644
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@@ -52,8 +52,10 @@ texture<int4> z2r_sp2_tex;
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MAX(A,B) ((A) > (B) ? (A) : (B))
 
+#if (ARCH < 300)
+
 #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,           \
-                        eflag,vflag,engv,rdrho,nrho,i)                      \
+                        eflag,vflag,engv,rdrho,nrho,i,rhomax)               \
   if (t_per_atom>1) {                                                       \
     __local acctyp red_acc[BLOCK_PAIR];                                     \
     red_acc[tid]=rho;                                                       \
@@ -76,10 +78,11 @@ texture<int4> z2r_sp2_tex;
     if (eflag>0) {                                                          \
       fetch4(coeff,index,frho_sp2_tex);                                     \
       energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;             \
+      if (rho > rhomax) energy += fp*(rho-rhomax);                          \
       engv[ii]=(acctyp)2.0*energy;                                          \
     }                                                                       \
   }
-
+  
 #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom,     \
                       offset, elag, vflag, ans, engv)                       \
   if (t_per_atom>1) {                                                       \
@@ -125,18 +128,80 @@ texture<int4> z2r_sp2_tex;
     ans[ii]=f;                                                              \
   }
 
-__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
-                            __global int *type2frho, 
-                            __global numtyp4 *rhor_spline2, 
-                            __global numtyp4 *frho_spline1,
-                            __global numtyp4 *frho_spline2,
-                            __global int *dev_nbor, __global int *dev_packed,
-                            __global numtyp *fp_, __global acctyp *engv, 
-                            const int eflag, const int inum, 
-                            const int nbor_pitch, const int ntypes, 
-                            const numtyp cutforcesq, const numtyp rdr, 
-                            const numtyp rdrho, const int nrho, const int nr,
-                            const int t_per_atom) {
+#else
+
+#define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,           \
+                        eflag,vflag,engv,rdrho,nrho,i,rhomax)               \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1)                           \
+        rho += shfl_xor(rho, s, t_per_atom);                                \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    numtyp p = rho*rdrho + (numtyp)1.0;                                     \
+    int m=p;                                                                \
+    m = MAX(1,MIN(m,nrho-1));                                               \
+    p -= m;                                                                 \
+    p = MIN(p,(numtyp)1.0);                                                 \
+    int index = type2frho[itype]*(nrho+1)+m;                                \
+    numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex);                        \
+    numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z;                          \
+    fp_[i]=fp;                                                              \
+    if (eflag>0) {                                                          \
+      fetch4(coeff,index,frho_sp2_tex);                                     \
+      energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w;             \
+      if (rho > rhomax) energy += fp*(rho-rhomax);                          \
+      engv[ii]=(acctyp)2.0*energy;                                          \
+    }                                                                       \
+  }
+
+#define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom,     \
+                          offset, eflag, vflag, ans, engv)                  \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv+=energy;                                                        \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv=virial[i];                                                    \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#endif
+
+__kernel void k_energy(const __global numtyp4 *restrict x_, 
+                       const __global int2 *restrict type2rhor_z2r,
+                       const __global int *restrict type2frho, 
+                       const __global numtyp4 *restrict rhor_spline2, 
+                       const __global numtyp4 *restrict frho_spline1,
+                       const __global numtyp4 *restrict frho_spline2,
+                       const __global int *dev_nbor, 
+                       const __global int *dev_packed,
+                       __global numtyp *restrict fp_, 
+                       __global acctyp *restrict engv, 
+                       const int eflag, const int inum, const int nbor_pitch,
+                       const int ntypes,  const numtyp cutforcesq, 
+                       const numtyp rdr, const numtyp rdrho, 
+                       const numtyp rhomax, const int nrho,
+                       const int nr, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -144,7 +209,7 @@ __kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
   acctyp energy = (acctyp)0;
    
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -180,24 +245,26 @@ __kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r,
     } // for nbor
     
     store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
-        eflag,vflag,engv,rdrho,nrho,i);
+        eflag,vflag,engv,rdrho,nrho,i,rhomax);
   } // if ii
 }
 
-__kernel void k_energy_fast(__global numtyp4 *x_, 
-                                 __global int2 *type2rhor_z2r_in,
-                                 __global int *type2frho_in, 
-                                 __global numtyp4 *rhor_spline2, 
-                                 __global numtyp4 *frho_spline1,
-                                 __global numtyp4 *frho_spline2,
-                                 __global int *dev_nbor, 
-                                 __global int *dev_packed, __global numtyp *fp_, 
-                                 __global acctyp *engv, const int eflag, 
-                                 const int inum, const int nbor_pitch,
-                                 const int ntypes, const numtyp cutforcesq, 
-                                 const numtyp rdr, const numtyp rdrho,
-                                 const int nrho, const int nr, 
-                                 const int t_per_atom) {
+__kernel void k_energy_fast(const __global numtyp4 *restrict x_, 
+                            const __global int2 *restrict type2rhor_z2r_in,
+                            const __global int *restrict type2frho_in, 
+                            const __global numtyp4 *restrict rhor_spline2, 
+                            const __global numtyp4 *restrict frho_spline1,
+                            const __global numtyp4 *restrict frho_spline2,
+                            const __global int *dev_nbor, 
+                            const __global int *dev_packed, 
+                            __global numtyp *restrict fp_, 
+                            __global acctyp *restrict engv, 
+                            const int eflag,  const int inum, 
+                            const int nbor_pitch, const int ntypes, 
+                            const numtyp cutforcesq,  const numtyp rdr,
+                            const numtyp rdrho, const numtyp rhomax, 
+                            const int nrho, const int nr, 
+                            const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -218,7 +285,7 @@ __kernel void k_energy_fast(__global numtyp4 *x_,
   __syncthreads(); 
 
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -254,22 +321,24 @@ __kernel void k_energy_fast(__global numtyp4 *x_,
     } // for nbor
     
     store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset,
-                    eflag,vflag,engv,rdrho,nrho,i);
+                    eflag,vflag,engv,rdrho,nrho,i,rhomax);
   } // if ii
 }
 
-__kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
-                          __global int2 *type2rhor_z2r,
-                          __global numtyp4 *rhor_spline1, 
-                          __global numtyp4 *z2r_spline1,
-                          __global numtyp4 *z2r_spline2,
-                          __global int *dev_nbor, __global int *dev_packed, 
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, 
-                          const int inum, const int nbor_pitch,
-                          const int ntypes, const numtyp cutforcesq, 
-                          const numtyp rdr, const int nr,
-                          const int t_per_atom) {
+__kernel void k_eam(const __global numtyp4 *restrict x_, 
+                    const __global numtyp *fp_,
+                    const __global int2 *type2rhor_z2r,
+                    const __global numtyp4 *rhor_spline1, 
+                    const __global numtyp4 *z2r_spline1,
+                    const __global numtyp4 *z2r_spline2,
+                    const __global int *dev_nbor,
+                    const __global int *dev_packed, 
+                    __global acctyp4 *ans,
+                    __global acctyp *engv, 
+                    const int eflag, const int vflag,  const int inum,
+                    const int nbor_pitch, const int ntypes, 
+                    const numtyp cutforcesq,  const numtyp rdr, const int nr,
+                    const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -283,7 +352,7 @@ __kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -364,18 +433,19 @@ __kernel void k_eam(__global numtyp4 *x_, __global numtyp *fp_,
 
 }
 
-__kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
-                          __global int2 *type2rhor_z2r_in,
-                          __global numtyp4 *rhor_spline1, 
-                          __global numtyp4 *z2r_spline1,
-                          __global numtyp4 *z2r_spline2,
-                          __global int *dev_nbor, __global int *dev_packed, 
-                          __global acctyp4 *ans, __global acctyp *engv, 
-                          const int eflag, const int vflag, const int inum, 
-                          const int nbor_pitch,
-                          const numtyp cutforcesq, 
-                          const numtyp rdr, const int nr,
-                          const int t_per_atom) {
+__kernel void k_eam_fast(const __global numtyp4 *x_, 
+                         const __global numtyp *fp_,
+                         const __global int2 *type2rhor_z2r_in,
+                         const __global numtyp4 *rhor_spline1, 
+                         const __global numtyp4 *z2r_spline1,
+                         const __global numtyp4 *z2r_spline2,
+                         const __global int *dev_nbor, 
+                         const __global int *dev_packed, 
+                         __global acctyp4 *ans, 
+                         __global acctyp *engv, 
+                         const int eflag, const int vflag, const int inum, 
+                         const int nbor_pitch, const numtyp cutforcesq, 
+                         const numtyp rdr, const int nr, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -395,7 +465,7 @@ __kernel void k_eam_fast(__global numtyp4 *x_, __global numtyp *fp_,
   __syncthreads();
 
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h
index a22a4551d4..96dcb9bebc 100644
--- a/lib/gpu/lal_eam.h
+++ b/lib/gpu/lal_eam.h
@@ -41,10 +41,10 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
   int init(const int ntypes, double host_cutforcesq, int **host_type2rhor,
            int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline,
            double ***host_z2r_spline, double ***host_frho_spline, double rdr,
-           double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr,
-           const int nlocal, const int nall, const int max_nbors,
-           const int maxspecial, const double cell_size, const double gpu_split,
-           FILE *_screen);
+           double rdrho, double rhomax, int nrhor, int nrho, int nz2r, 
+           int nfrho, int nr, const int nlocal, const int nall, 
+           const int max_nbors, const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *_screen);
   
   // Copy charges to device asynchronously
   inline void add_fp_data() {
@@ -112,7 +112,7 @@ class EAM : public BaseAtomic<numtyp, acctyp> {
   UCL_D_Vec<numtyp4> frho_spline1, frho_spline2;
   UCL_D_Vec<numtyp4> rhor_spline1, rhor_spline2;
     
-  numtyp _cutforcesq,_rdr,_rdrho;
+  numtyp _cutforcesq,_rdr,_rdrho, _rhomax;
   
   int _nfrho,_nrhor,_nrho,_nz2r,_nr;
   
diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp
index 359d949a2d..687a04529f 100644
--- a/lib/gpu/lal_eam_ext.cpp
+++ b/lib/gpu/lal_eam_ext.cpp
@@ -31,7 +31,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
                  int **host_type2rhor, int **host_type2z2r, int *host_type2frho,
                  double ***host_rhor_spline, double ***host_z2r_spline,
                  double ***host_frho_spline,
-                 double rdr, double rdrho, int nrhor, 
+                 double rdr, double rdrho, double rhomax, int nrhor, 
                  int nrho, int nz2r, int nfrho, int nr, 
                  const int nlocal, const int nall, const int max_nbors, 
                  const int maxspecial, const double cell_size, 
@@ -66,9 +66,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
   if (world_me==0)
     init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r,
                        host_type2frho, host_rhor_spline, host_z2r_spline,
-                       host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho,
-                       nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split,
-                       screen);
+                       host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, 
+                       nfrho, nr, nlocal, nall, 300, maxspecial, cell_size,
+                       gpu_split, screen);
 
   EAMMF.device->world_barrier();
   if (message)
@@ -86,9 +86,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq,
     if (gpu_rank==i && world_me!=0)
       init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r,
                          host_type2frho, host_rhor_spline, host_z2r_spline,
-                         host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho,
-                         nr, nlocal, nall, 300, maxspecial, cell_size, 
-                         gpu_split, screen);
+                         host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, 
+                         nz2r, nfrho, nr, nlocal, nall, 300, maxspecial,
+                         cell_size, gpu_split, screen);
 
     EAMMF.device->gpu_barrier();
     if (message) 
diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h
index e2287c0af2..8311f4d617 100644
--- a/lib/gpu/lal_ellipsoid_extra.h
+++ b/lib/gpu/lal_ellipsoid_extra.h
@@ -19,7 +19,7 @@
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 
 #ifdef NV_KERNEL
-#include "lal_preprocessor.h"
+#include "lal_aux_fun1.h"
 #ifndef _DOUBLE_DOUBLE
 texture<float4> pos_tex, quat_tex;
 #else
@@ -30,11 +30,6 @@ texture<int4,1> pos_tex, quat_tex;
 #define quat_tex qif
 #endif
 
-#define atom_info(t_per_atom, ii, tid, offset)                               \
-  tid=THREAD_ID_X;                                                           \
-  offset=tid & (t_per_atom-1);                                               \
-  ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
-
 #define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset,           \
                     i, numj, stride, list_end, nbor)                         \
     nbor=nbor_mem+ii;                                                        \
@@ -42,55 +37,11 @@ texture<int4,1> pos_tex, quat_tex;
     nbor+=nbor_stride;                                                       \
     numj=*nbor;                                                              \
     nbor+=nbor_stride;                                                       \
-    list_end=nbor+fast_mul(nbor_stride,numj);                                   \
-    nbor+=fast_mul(offset,nbor_stride);                                         \
+    list_end=nbor+fast_mul(nbor_stride,numj);                                \
+    nbor+=fast_mul(offset,nbor_stride);                                      \
     stride=fast_mul(t_per_atom,nbor_stride);
 
-#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
-                      eflag, vflag, ans, engv)                              \
-  if (t_per_atom>1) {                                                       \
-    __local acctyp red_acc[6][BLOCK_PAIR];                                  \
-    red_acc[0][tid]=f.x;                                                    \
-    red_acc[1][tid]=f.y;                                                    \
-    red_acc[2][tid]=f.z;                                                    \
-    red_acc[3][tid]=energy;                                                 \
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
-      if (offset < s) {                                                     \
-        for (int r=0; r<4; r++)                                             \
-          red_acc[r][tid] += red_acc[r][tid+s];                             \
-      }                                                                     \
-    }                                                                       \
-    f.x=red_acc[0][tid];                                                    \
-    f.y=red_acc[1][tid];                                                    \
-    f.z=red_acc[2][tid];                                                    \
-    energy=red_acc[3][tid];                                                 \
-    if (vflag>0) {                                                          \
-      for (int r=0; r<6; r++)                                               \
-        red_acc[r][tid]=virial[r];                                          \
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
-        if (offset < s) {                                                   \
-          for (int r=0; r<6; r++)                                           \
-            red_acc[r][tid] += red_acc[r][tid+s];                           \
-        }                                                                   \
-      }                                                                     \
-      for (int r=0; r<6; r++)                                               \
-        virial[r]=red_acc[r][tid];                                          \
-    }                                                                       \
-  }                                                                         \
-  if (offset==0) {                                                          \
-    engv+=ii;                                                               \
-    if (eflag>0) {                                                          \
-      *engv=energy;                                                         \
-      engv+=inum;                                                           \
-    }                                                                       \
-    if (vflag>0) {                                                          \
-      for (int i=0; i<6; i++) {                                             \
-        *engv=virial[i];                                                    \
-        engv+=inum;                                                         \
-      }                                                                     \
-    }                                                                       \
-    ans[ii]=f;                                                              \
-  }
+#if (ARCH < 300)
 
 #define store_answers_t(f, tor, energy, virial, ii, astride, tid,           \
                         t_per_atom, offset, eflag, vflag, ans, engv)        \
@@ -195,6 +146,80 @@ texture<int4,1> pos_tex, quat_tex;
     ans[ii]=old;                                                            \
   }
 
+#else
+
+#define store_answers_t(f, tor, energy, virial, ii, astride, tid,           \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1=energy;                                                          \
+      ap1+=astride;                                                         \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i];                                                     \
+        ap1+=astride;                                                       \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+astride]=tor;                                                    \
+  }
+
+#define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset,   \
+                    eflag, vflag, ans, engv)                                \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    engv+=ii;                                                               \
+    if (eflag>0) {                                                          \
+      *engv+=energy;                                                        \
+      engv+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *engv+=virial[i];                                                   \
+        engv+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    acctyp4 old=ans[ii];                                                    \
+    old.x+=f.x;                                                             \
+    old.y+=f.y;                                                             \
+    old.z+=f.z;                                                             \
+    ans[ii]=old;                                                            \
+  }
+
+#endif
+
 /* ----------------------------------------------------------------------
    dot product of 2 vectors
 ------------------------------------------------------------------------- */
diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu
index 0be6c0922d..47ee173a4b 100644
--- a/lib/gpu/lal_ellipsoid_nbor.cu
+++ b/lib/gpu/lal_ellipsoid_nbor.cu
@@ -29,22 +29,24 @@ texture<int4,1> pos_tex;
 // -- Only unpack neighbors matching the specified inclusive range of forms
 // -- Only unpack neighbors within cutoff
 // ---------------------------------------------------------------------------
-__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, 
-                          const int ntypes, __global int *dev_nbor,
+__kernel void kernel_nbor(const __global numtyp4 *restrict x_, 
+                          const __global numtyp2 *restrict cut_form, 
+                          const int ntypes, 
+                          __global int *dev_nbor,
                           const int nbor_pitch, const int start, const int inum, 
-                          __global int *dev_ij, const int form_low, 
-                          const int form_high) {
+                          const __global int *dev_ij, 
+                          const int form_low, const int form_high) {
                                 
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X+start;
 
   if (ii<inum) {
-    __global int *nbor=dev_ij+ii;
+    const __global int *nbor=dev_ij+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
+    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
     __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
@@ -84,11 +86,13 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
 // -- Only unpack neighbors within cutoff
 // -- Fast version of routine that uses shared memory for LJ constants
 // ---------------------------------------------------------------------------
-__kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
-                               __global int *dev_nbor, const int nbor_pitch, 
-                               const int start, const int inum, 
-                               __global int *dev_ij, const int form_low, 
-                               const int form_high) {
+__kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp2 *restrict cut_form,
+                               __global int *dev_nbor, 
+                               const int nbor_pitch, const int start, 
+                               const int inum, 
+                               const __global int *dev_ij, 
+                               const int form_low, const int form_high) {
                                 
   int ii=THREAD_ID_X;
   __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -101,12 +105,12 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
   __syncthreads();
 
   if (ii<inum) {
-    __global int *nbor=dev_ij+ii;
+    const __global int *nbor=dev_ij+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
-    __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
+    const __global int *list_end=nbor+fast_mul(numj,nbor_pitch);
     __global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
   
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu
index b078d17c3b..d3fcec6edf 100644
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@@ -24,12 +24,15 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
+__kernel void k_gauss(const __global numtyp4 *restrict x_, 
+                      const __global numtyp4 *restrict gauss1,
                       const int lj_types, 
-                      __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                      __global int *dev_packed, __global acctyp4 *ans,
-                      __global acctyp *engv, const int eflag, 
-                      const int vflag, const int inum,
+                      const __global numtyp *restrict sp_lj_in, 
+                      const __global int *dev_nbor, 
+                      const __global int *dev_packed, 
+                      __global acctyp4 *restrict ans,
+                      __global acctyp *restrict engv, 
+                      const int eflag, const int vflag, const int inum,
                       const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -48,7 +51,7 @@ __kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -104,10 +107,13 @@ __kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1,
   } // if ii
 }
 
-__kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
-                           __global numtyp* sp_lj_in, 
-                           __global int *dev_nbor, __global int *dev_packed, 
-                           __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_gauss_fast(const __global numtyp4 *restrict x_, 
+                           const __global numtyp4 *restrict gauss1_in,
+                           const __global numtyp *restrict sp_lj_in, 
+                           const __global int *dev_nbor,
+                           const __global int *dev_packed, 
+                           __global acctyp4 *restrict ans,
+                           __global acctyp *restrict engv, 
                            const int eflag, const int vflag, const int inum, 
                            const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -131,7 +137,7 @@ __kernel void k_gauss_fast(__global numtyp4 *x_, __global numtyp4 *gauss1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu
index dbff1178ef..da236fa79a 100644
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@@ -80,13 +80,20 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
 		    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }
 
-__kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
-                         __global numtyp4* shape, __global numtyp4* well, 
-                         __global numtyp *gum, __global numtyp2* sig_eps, 
-                         const int ntypes, __global numtyp *lshape, 
-                         __global int *dev_nbor, const int stride, 
-                         __global acctyp4 *ans, const int astride, 
-                         __global acctyp *engv, __global int *err_flag, 
+__kernel void k_gayberne(const __global numtyp4 *restrict x_,
+                         const __global numtyp4 *restrict q,
+                         const __global numtyp4 *restrict shape, 
+                         const __global numtyp4 *restrict well, 
+                         const __global numtyp *restrict gum, 
+                         const __global numtyp2 *restrict sig_eps, 
+                         const int ntypes, 
+                         const __global numtyp *restrict lshape, 
+                         const __global int *dev_nbor, 
+                         const int stride, 
+                         __global acctyp4 *restrict ans, 
+                         const int astride, 
+                         __global acctyp *restrict engv, 
+                         __global int *restrict err_flag, 
                          const int eflag, const int vflag, const int inum,
                          const int t_per_atom) {
   int tid, ii, offset;
@@ -112,7 +119,7 @@ __kernel void k_gayberne(__global numtyp4* x_,__global numtyp4 *q,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
     int i, numj, n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu
index 05a9b1008a..e48b060026 100644
--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@@ -17,15 +17,22 @@
 #include "lal_ellipsoid_extra.h"
 #endif
 
-__kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
-                  __global numtyp4 *q, __global numtyp4* shape,
-                  __global numtyp4* well, __global numtyp *gum, 
-                  __global numtyp2* sig_eps, const int ntypes, 
-                  __global numtyp *lshape, __global int *dev_nbor, 
-                  const int stride, __global acctyp4 *ans, 
-                  __global acctyp *engv, __global int *err_flag, 
-                  const int eflag, const int vflag,const int start, 
-                  const int inum, const int t_per_atom) {
+__kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
+                                          const __global numtyp4 *restrict q, 
+                                          const __global numtyp4 *restrict shape,
+                                          const __global numtyp4 *restrict well, 
+                                          const __global numtyp *restrict gum, 
+                                          const __global numtyp2 *restrict sig_eps,
+                                          const int ntypes, 
+                                          const __global numtyp *restrict lshape,
+                                          const __global int *dev_nbor, 
+                                          const int stride,
+                                          __global acctyp4 *restrict ans, 
+                                          __global acctyp *restrict engv,
+                                          __global int *restrict err_flag, 
+                                          const int eflag, const int vflag,
+                                          const int start, const int inum, 
+                                          const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -46,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
     int i, numj, n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
@@ -236,11 +243,16 @@ __kernel void k_gayberne_sphere_ellipsoid(__global numtyp4 *x_,
   } // if ii
 }
 
-__kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
-                            __global numtyp4* lj3, const int lj_types, 
-                            __global numtyp *gum, const int stride, 
-                            __global int *dev_ij, __global acctyp4 *ans, 
-                            __global acctyp *engv, __global int *err_flag, 
+__kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict lj1, 
+                            const __global numtyp4 *restrict lj3, 
+                            const int lj_types, 
+                            const __global numtyp *restrict gum, 
+                            const int stride, 
+                            const __global int *dev_ij, 
+                            __global acctyp4 *restrict ans, 
+                            __global acctyp *restrict engv, 
+                            __global int *restrict err_flag, 
                             const int eflag, const int vflag, const int start,
                             const int inum, const int t_per_atom) {
   int tid, ii, offset;
@@ -263,7 +275,7 @@ __kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
@@ -318,13 +330,18 @@ __kernel void k_gayberne_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, 
-                                 __global numtyp4* lj3_in, __global numtyp *gum, 
-                                 const int stride, __global int *dev_ij,
-                                 __global acctyp4 *ans, __global acctyp *engv,
-                                 __global int *err_flag, const int eflag,
-                                 const int vflag, const int start, 
-                                 const int inum, const int t_per_atom) {
+__kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, 
+                                 const __global numtyp4 *restrict lj1_in, 
+                                 const __global numtyp4 *restrict lj3_in, 
+                                 const __global numtyp *restrict gum, 
+                                 const int stride, 
+                                 const __global int *dev_ij,
+                                 __global acctyp4 *restrict ans, 
+                                 __global acctyp *restrict engv,
+                                 __global int *restrict err_flag, 
+                                 const int eflag, const int vflag, 
+                                 const int start, const int inum, 
+                                 const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -352,7 +369,7 @@ __kernel void k_gayberne_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu
index 0a049d187e..b230927c1f 100644
--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_lj(const __global numtyp4 *restrict x_, 
+                   const __global numtyp4 *restrict lj1,
+                   const __global numtyp4 *restrict lj3, 
+                   const int lj_types, 
+                   const __global numtyp *restrict sp_lj_in, 
+                   const __global int * dev_nbor, 
+                   const __global int * dev_packed, 
+                   __global acctyp4 *restrict ans, 
+                   __global acctyp *restrict engv, 
+                   const int eflag, const int vflag, const int inum,
+                   const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -48,7 +52,7 @@ __kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -103,13 +107,16 @@ __kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_lj_fast(const __global numtyp4 *restrict x_, 
+                        const __global numtyp4 *restrict lj1_in,
+                        const __global numtyp4 *restrict lj3_in, 
+                        const __global numtyp *restrict sp_lj_in, 
+                        const __global int * dev_nbor, 
+                        const __global int * dev_packed, 
+                        __global acctyp4 *restrict ans, 
+                        __global acctyp *restrict engv, 
+                        const int eflag, const int vflag, const int inum, 
+                        const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -134,7 +141,7 @@ __kernel void k_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu
index a5fc6c89df..141c830a0a 100644
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@@ -24,13 +24,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_lj96(const __global numtyp4 *restrict x_,
+                     const __global numtyp4 *restrict lj1,
+                     const __global numtyp4 *restrict lj3, 
+                     const int lj_types, 
+                     const __global numtyp *restrict sp_lj_in, 
+                     const __global int *dev_nbor, 
+                     const __global int *dev_packed, 
+                     __global acctyp4 *restrict ans,
+                     __global acctyp *restrict engv, 
+                     const int eflag, const int vflag, const int inum,
+                     const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -48,7 +52,7 @@ __kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -104,13 +108,16 @@ __kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_lj96_fast(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict lj1_in,
+                          const __global numtyp4 *restrict lj3_in, 
+                          const __global numtyp *restrict sp_lj_in,
+                          const __global int *dev_nbor,
+                          const __global int *dev_packed,
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum, 
+                          const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -135,7 +142,7 @@ __kernel void k_lj96_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu
index e08baf5a5f..5262c11147 100644
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
+                               const __global numtyp4 *restrict lj1,
+                               const __global numtyp4 *restrict lj3,
+                               const int lj_types, 
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv,
+                               const int eflag,  const int vflag, 
+                               const int inum, const int nbor_pitch, 
+                               const __global numtyp *restrict q_, 
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -60,7 +65,7 @@ __kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -141,15 +146,21 @@ __kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, 
+                                    const __global numtyp4 *restrict lj1_in,
+                                    const __global numtyp4 *restrict lj3_in, 
+                                    const __global numtyp *restrict sp_lj_in,
+                                    const __global int *dev_nbor, 
+                                    const __global int *dev_packed,
+                                    __global acctyp4 *restrict ans, 
+                                    __global acctyp *restrict engv, 
+                                    const int eflag, const int vflag, 
+                                    const int inum, const int nbor_pitch, 
+                                    const __global numtyp *restrict q_,
+                                    const numtyp cut_coulsq, 
+                                    const numtyp qqrd2e,
+                                    const numtyp g_ewald, 
+                                    const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -175,7 +186,7 @@ __kernel void k_lj_class2_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu
index feccf7fc4c..48e8399183 100644
--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                          __global numtyp *cutsq, const numtyp qqrd2e,
-                          const int t_per_atom) {
+__kernel void k_lj_coul(const __global numtyp4 *restrict x_, 
+                        const __global numtyp4 *restrict lj1,
+                        const __global numtyp4 *restrict  lj3, 
+                        const int lj_types, 
+                        const __global numtyp *restrict sp_lj_in,
+                        const __global int *dev_nbor, 
+                        const __global int *dev_packed, 
+                        __global acctyp4 *restrict ans,
+                        __global acctyp *restrict engv, 
+                        const int eflag, const int vflag, const int inum,
+                        const int nbor_pitch, 
+                        const __global numtyp *restrict q_, 
+                        const __global numtyp *restrict cutsq, 
+                        const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -60,7 +65,7 @@ __kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -132,15 +137,19 @@ __kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp *_cutsq, const numtyp qqrd2e,
-                               const int t_per_atom) {
+__kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
+                             const __global numtyp4 *restrict lj1_in,
+                             const __global numtyp4 *restrict lj3_in, 
+                             const __global numtyp *restrict sp_lj_in,
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed,
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv, 
+                             const int eflag, const int vflag, const int inum, 
+                             const int nbor_pitch, 
+                             const __global numtyp *restrict q_,
+                             const __global numtyp *restrict _cutsq, 
+                             const numtyp qqrd2e, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -168,7 +177,7 @@ __kernel void k_lj_coul_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu
index e63d7a40e3..dc5a85a9b9 100644
--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@@ -29,16 +29,21 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_ ,
-                          __global numtyp *cutsq, const numtyp qqrd2e,
-                          const numtyp kappa,
-                          const int t_per_atom) {
+__kernel void k_lj_debye(const __global numtyp4 *restrict x_, 
+                         const __global numtyp4 *restrict lj1,
+                         const __global numtyp4 *restrict lj3, 
+                         const int lj_types, 
+                         const __global numtyp *restrict sp_lj_in, 
+                         const __global int *dev_nbor, 
+                         const __global int *dev_packed, 
+                         __global acctyp4 *restrict ans,
+                         __global acctyp *restrict engv,
+                         const int eflag, const int vflag, const int inum,
+                         const int nbor_pitch,
+                         const __global numtyp *restrict q_ ,
+                         const __global numtyp *restrict cutsq, 
+                         const numtyp qqrd2e, const numtyp kappa,
+                         const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -61,7 +66,7 @@ __kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -139,16 +144,20 @@ __kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               __global numtyp *_cutsq, const numtyp qqrd2e,
-                               const numtyp kappa,
-                               const int t_per_atom) {
+__kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_,
+                              const __global numtyp4 *restrict lj1_in,
+                              const __global numtyp4 *restrict lj3_in, 
+                              const __global numtyp *restrict sp_lj_in,
+                              const __global int *dev_nbor,
+                              const __global int *dev_packed,
+                              __global acctyp4 *restrict ans,
+                              __global acctyp *restrict engv, 
+                              const int eflag, const int vflag, const int inum, 
+                              const int nbor_pitch, 
+                              const __global numtyp *restrict q_, 
+                              const __global numtyp *restrict _cutsq,
+                              const numtyp qqrd2e, const numtyp kappa,
+                              const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -176,7 +185,7 @@ __kernel void k_lj_debye_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu
index 9b655ab304..2fcf645017 100644
--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@@ -29,15 +29,20 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, __global numtyp *q_,
-                          const numtyp cut_coulsq, const numtyp qqrd2e,
-                          const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict lj1,
+                             const __global numtyp4 *restrict lj3, 
+                             const int lj_types, 
+                             const __global numtyp *restrict sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed,
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv,
+                             const int eflag, const int vflag, const int inum,
+                             const int nbor_pitch, 
+                             const __global numtyp *restrict q_,
+                             const numtyp cut_coulsq, const numtyp qqrd2e,
+                             const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -60,7 +65,7 @@ __kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -139,15 +144,19 @@ __kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, __global numtyp *q_,
-                               const numtyp cut_coulsq, const numtyp qqrd2e,
-                               const numtyp g_ewald, const int t_per_atom) {
+__kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, 
+                                  const __global numtyp4 *restrict lj1_in,
+                                  const __global numtyp4 *restrict lj3_in, 
+                                  const __global numtyp *restrict sp_lj_in,
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed,
+                                  __global acctyp4 *restrict ans, 
+                                  __global acctyp *restrict engv, 
+                                  const int eflag, const int vflag,
+                                  const int inum,  const int nbor_pitch,
+                                  const __global numtyp *restrict q_,
+                                  const numtyp cut_coulsq, const numtyp qqrd2e,
+                                  const numtyp g_ewald, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -173,7 +182,7 @@ __kernel void k_lj_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu
index 9073d72884..757e36cbc5 100644
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@@ -31,13 +31,18 @@ texture<int2> q_tex;
 
 #define MY_PIS (acctyp)1.77245385090551602729
 
-__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
-                       __global numtyp4* lj3, const int lj_types, 
-                       __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                       __global int *dev_packed, __global acctyp4 *ans,
-                       __global acctyp *engv, const int eflag,
-                       const int vflag, const int inum,
-                       const int nbor_pitch, __global numtyp *q_ ,
+__kernel void k_lj_dsf(const __global numtyp4 *restrict x_, 
+                       const __global numtyp4 *restrict lj1,
+                       const __global numtyp4 *restrict lj3, 
+                       const int lj_types, 
+                       const __global numtyp *restrict sp_lj_in, 
+                       const __global int *dev_nbor, 
+                       const __global int *dev_packed, 
+                       __global acctyp4 *restrict ans,
+                       __global acctyp *restrict engv, 
+                       const int eflag, const int vflag, const int inum,
+                       const int nbor_pitch, 
+                       const __global numtyp *restrict q_ ,
                        const numtyp cut_coulsq, const numtyp qqrd2e,
                        const numtyp e_shift, const numtyp f_shift, 
                        const numtyp alpha, const int t_per_atom) {
@@ -63,7 +68,7 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -72,6 +77,12 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
     numtyp qtmp; fetch(qtmp,i,q_tex);
     int itype=ix.w;
 
+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
+
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
@@ -107,8 +118,8 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
           numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
           erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-          forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd + 
-                                   rsq*f_shift);
+          forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + 
+            rsq*f_shift);
         } else
           forcecoul = (numtyp)0.0;
 
@@ -144,12 +155,17 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                            __global numtyp4* lj3_in, __global numtyp* sp_lj_in,
-                            __global int *dev_nbor, __global int *dev_packed,
-                            __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict lj1_in,
+                            const __global numtyp4 *restrict lj3_in,
+                            const __global numtyp *restrict sp_lj_in,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv, 
                             const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, __global numtyp *q_,
+                            const int nbor_pitch,
+                            const __global numtyp *restrict q_,
                             const numtyp cut_coulsq, const numtyp qqrd2e,
                             const numtyp e_shift, const numtyp f_shift, 
                             const numtyp alpha, const int t_per_atom) {
@@ -178,7 +194,7 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -188,6 +204,12 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     int iw=ix.w;
     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
 
+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * 
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
+
     for ( ; nbor<list_end; nbor+=n_stride) {
       int j=*nbor;
 
@@ -222,8 +244,8 @@ __kernel void k_lj_dsf_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
           numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
           erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
-          forcecoul = prefactor * (erfcc + 2.0*alpha/MY_PIS*r*erfcd + 
-                                   rsq*f_shift);
+          forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd + 
+            rsq*f_shift);
         } else
           forcecoul = (numtyp)0.0;
 
diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu
index 3e98ed7d9c..ee3ef60b68 100644
--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@@ -26,12 +26,16 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
-                          __global numtyp4* lj3, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
+__kernel void k_lj_expand(const __global numtyp4 *restrict x_, 
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3, 
+                          const int lj_types, 
+                          const __global numtyp *restrict sp_lj_in, 
+                          const __global int *dev_nbor, 
+                          const __global int *dev_packed, 
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv, 
+                          const int eflag, const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -50,7 +54,7 @@ __kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -108,11 +112,14 @@ __kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
-                               __global numtyp4* lj3_in, 
-                               __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
+__kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, 
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in, 
+                               const __global numtyp *restrict sp_lj_in, 
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans, 
+                               __global acctyp *restrict engv, 
                                const int eflag, const int vflag, const int inum, 
                                const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
@@ -139,7 +146,7 @@ __kernel void k_lj_expand_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu
index 4d89180390..55847f3d36 100644
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@@ -26,13 +26,17 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
-                          __global numtyp2* mor2, const int lj_types, 
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_morse(const __global numtyp4 *restrict x_, 
+                      const __global numtyp4 *restrict mor1,
+                      const __global numtyp2 *restrict mor2, 
+                      const int lj_types, 
+                      const __global numtyp *restrict sp_lj_in, 
+                      const __global int *dev_nbor, 
+                      const __global int *dev_packed, 
+                      __global acctyp4 *restrict ans,
+                      __global acctyp *restrict engv,
+                      const int eflag, const int vflag, const int inum,
+                      const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -50,7 +54,7 @@ __kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -106,13 +110,16 @@ __kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1,
   } // if ii
 }
 
-__kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
-                               __global numtyp2* mor2_in, 
-                               __global numtyp* sp_lj_in,
-                               __global int *dev_nbor, __global int *dev_packed,
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_morse_fast(const __global numtyp4 *restrict x_, 
+                           const __global numtyp4 *restrict mor1_in,
+                           const __global numtyp2 *restrict mor2_in, 
+                           const __global numtyp *restrict sp_lj_in,
+                           const __global int *dev_nbor,
+                           const __global int *dev_packed,
+                           __global acctyp4 *restrict ans,
+                           __global acctyp *restrict engv, 
+                           const int eflag, const int vflag, const int inum, 
+                           const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -137,7 +144,7 @@ __kernel void k_morse_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu
index 4fbf29f100..1a1d392032 100644
--- a/lib/gpu/lal_neighbor_cpu.cu
+++ b/lib/gpu/lal_neighbor_cpu.cu
@@ -17,7 +17,8 @@
 #include "lal_preprocessor.h"
 #endif
 
-__kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
+__kernel void kernel_unpack(__global int *dev_nbor, 
+                            const __global int *dev_ij,
                             const int inum, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int offset=tid & (t_per_atom-1);
@@ -27,8 +28,8 @@ __kernel void kernel_unpack(__global int *dev_nbor, __global int *dev_ij,
     __global int *nbor=dev_nbor+ii+inum;
     int numj=*nbor;
     nbor+=inum;
-    __global int *list=dev_ij+*nbor;
-    __global int *list_end=list+numj;
+    const __global int *list=dev_ij+*nbor;
+    const __global int *list_end=list+numj;
     list+=offset;
     nbor+=fast_mul(ii,t_per_atom-1)+offset;
     int stride=fast_mul(t_per_atom,inum);
diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu
index ebd18e2b2b..cc4e210932 100644
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -22,7 +22,9 @@ texture<float4> pos_tex;
 texture<int4,1> pos_tex;
 #endif
 
-__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
+__kernel void calc_cell_id(const numtyp4 *restrict pos, 
+                           unsigned *restrict cell_id, 
+                           int *restrict particle_id,
                            numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, 
                            numtyp i_cell_size, int ncellx, int ncelly, 
                            int ncellz, int inum, int nall, 
@@ -62,8 +64,9 @@ __kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id,
   }
 }
 
-__kernel void kernel_calc_cell_counts(unsigned *cell_id,
-                                      int *cell_counts, int nall, int ncell) {
+__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
+                                      int *restrict cell_counts, 
+                                      int nall, int ncell) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nall) {
     int id = cell_id[idx];
@@ -94,8 +97,9 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
 
 
 
-__kernel void transpose(__global int *out, __global int *in, int columns_in, 
-                        int rows_in)
+__kernel void transpose(__global int *restrict out, 
+                        const __global int *restrict in, 
+                        int columns_in, int rows_in)
 {
 	__local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
 	
@@ -117,9 +121,9 @@ __kernel void transpose(__global int *out, __global int *in, int columns_in,
 		out[j*rows_in+i] = block[ti][tj];
 }
 
-__kernel void calc_neigh_list_cell(__global numtyp4 *x_, 
-                                   __global int *cell_particle_id, 
-                                   __global int *cell_counts, 
+__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, 
+                                   const __global int *restrict cell_particle_id, 
+                                   const __global int *restrict cell_counts, 
                                    __global int *nbor_list,
                                    __global int *host_nbor_list, 
                                    __global int *host_numj, 
@@ -234,8 +238,10 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
 
 __kernel void kernel_special(__global int *dev_nbor, 
                              __global int *host_nbor_list, 
-                             __global int *host_numj, __global int *tag,
-                             __global int *nspecial, __global int *special,
+                             const __global int *host_numj, 
+                             const __global int *restrict tag,
+                             const __global int *restrict nspecial, 
+                             const __global int *restrict special,
                              int inum, int nt, int max_nbors, int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu
index 646afa5900..0ed5c6bad5 100644
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@@ -41,15 +41,18 @@ texture<int2> q_tex;
 // Number of pencils per block for charge spread
 #define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE)
 
-__kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
+__kernel void particle_map(const __global numtyp4 *restrict x_,  
+                           const __global numtyp *restrict q_,
                            const grdtyp delvolinv, const int nlocal, 
-                           __global int *counts, __global grdtyp4 *ans, 
+                           __global int *restrict counts, 
+                           __global grdtyp4 *restrict ans, 
                            const grdtyp b_lo_x, const grdtyp b_lo_y,
                            const grdtyp b_lo_z, const grdtyp delxinv,
                            const grdtyp delyinv, const grdtyp delzinv,
                            const int nlocal_x, const int nlocal_y,
                            const int nlocal_z, const int atom_stride,
-                           const int max_atoms, __global int *error) {
+                           const int max_atoms, 
+                           __global int *restrict error) {
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X;
 
@@ -97,8 +100,10 @@ __kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
 
 /* --------------------------- */
 
-__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
-                       __global grdtyp *brick, __global grdtyp *_rho_coeff,
+__kernel void make_rho(const __global int *restrict counts, 
+                       const __global grdtyp4 *restrict atoms,
+                       __global grdtyp *restrict brick, 
+                       const __global grdtyp *restrict _rho_coeff,
                        const int atom_stride, const int npts_x,
                        const int npts_y, const int npts_z, const int nlocal_x,
                        const int nlocal_y, const int nlocal_z,
@@ -192,15 +197,17 @@ __kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
   }
 }
 
-__kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
-                     const int nlocal, __global grdtyp4 *brick,
-                     __global grdtyp *_rho_coeff, const int npts_x,
-                     const int npts_yx, const grdtyp b_lo_x,
+__kernel void interp(const __global numtyp4 *restrict x_, 
+                     const __global numtyp *restrict q_,
+                     const int nlocal, 
+                     const __global grdtyp4 *restrict brick,
+                     const __global grdtyp *restrict _rho_coeff, 
+                     const int npts_x, const int npts_yx, const grdtyp b_lo_x,
                      const grdtyp b_lo_y, const grdtyp b_lo_z,
                      const grdtyp delxinv,  const grdtyp delyinv,
                      const grdtyp delzinv, const int order,
                      const int order2, const grdtyp qqrd2e_scale, 
-                     __global acctyp4 *ans) {
+                     __global acctyp4 *restrict ans) {
   __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
   __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
   __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h
index b817bbe551..f681268f1c 100644
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -56,8 +56,7 @@
 //     Definition:   Default thread block size for "bio" pair styles
 //  MAX_BIO_SHARED_TYPES
 //     Definition:   Max # of atom type params can be stored in shared memory
-//     Restrictions:  MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 &&
-//                    MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR
+//     Restrictions:  MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2
 //
 //*************************************************************************/
 
@@ -80,6 +79,7 @@
 #define __kernel extern "C" __global__
 #define __local __shared__
 #define __global  
+#define restrict __restrict__
 #define atom_add atomicAdd
 #define ucl_inline static __inline__ __device__ 
 
@@ -116,8 +116,22 @@
 #define BLOCK_NBOR_BUILD 128
 #define BLOCK_PAIR 512
 #define BLOCK_BIO_PAIR 512
+#define BLOCK_ELLIPSE 256
 #define MAX_SHARED_TYPES 11
 
+#ifdef _SINGLE_SINGLE
+#define shfl_xor __shfl_xor
+#else
+ucl_inline double shfl_xor(double var, int laneMask, int width) {
+  int2 tmp;
+  tmp.x = __double2hiint(var);
+  tmp.y = __double2loint(var);
+  tmp.x = __shfl_xor(tmp.x,laneMask,width);
+  tmp.y = __shfl_xor(tmp.y,laneMask,width);
+  return __hiloint2double(tmp.x,tmp.y);
+}
+#endif
+
 #endif
 
 #endif
@@ -380,3 +394,7 @@ typedef struct _double4 double4;
 #define NEIGHMASK 0x3FFFFFFF
 ucl_inline int sbmask(int j) { return j >> SBBITS & 3; };
 
+#ifndef BLOCK_ELLIPSE
+#define BLOCK_ELLIPSE BLOCK_PAIR
+#endif
+
diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu
index c858b09801..28e15b5cdf 100644
--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@@ -32,14 +32,20 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9])
   return ans;
 }
 
-__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
-                          __global numtyp4* shape, __global numtyp4* well, 
-                          __global numtyp *splj, __global numtyp2* sig_eps, 
-                          const int ntypes, __global int *dev_nbor,
-                          const int stride,  __global acctyp4 *ans,
-                          const int astride, __global acctyp *engv,
-                          __global int *err_flag, const int eflag,
-                          const int vflag, const int inum,
+__kernel void k_resquared(const __global numtyp4 *restrict x_,
+                          const __global numtyp4 *restrict q,
+                          const __global numtyp4 *restrict shape, 
+                          const __global numtyp4 *restrict well, 
+                          const __global numtyp *restrict splj, 
+                          const __global numtyp2 *restrict sig_eps, 
+                          const int ntypes, 
+                          const __global int *dev_nbor,
+                          const int stride,  
+                          __global acctyp4 *restrict ans,
+                          const int astride, 
+                          __global acctyp *restrict engv,
+                          __global int *restrict err_flag, 
+                          const int eflag, const int vflag, const int inum,
                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
@@ -68,7 +74,7 @@ __kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
     int i, numj, n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu
index 5c46c21e45..d287409d88 100644
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@@ -17,15 +17,132 @@
 #include "lal_ellipsoid_extra.h"
 #endif
 
-__kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
-                  __global numtyp4 *q, __global numtyp4* shape, 
-                  __global numtyp4* well, __global numtyp *splj, 
-                  __global numtyp2* sig_eps, const int ntypes, 
-                  __global int *dev_nbor, const int stride, 
-                  __global acctyp4 *ans, const int astride, 
-                  __global acctyp *engv, __global int *err_flag, 
-                   const int eflag, const int vflag, const int inum,
-                   const int t_per_atom) {
+#if (ARCH < 300)
+
+#define store_answers_rt(f, tor, energy, virial, ii, astride, tid,           \
+                         t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                        \
+    __local acctyp red_acc[7][BLOCK_PAIR];                                   \
+    red_acc[0][tid]=f.x;                                                     \
+    red_acc[1][tid]=f.y;                                                     \
+    red_acc[2][tid]=f.z;                                                     \
+    red_acc[3][tid]=tor.x;                                                   \
+    red_acc[4][tid]=tor.y;                                                   \
+    red_acc[5][tid]=tor.z;                                                   \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                          \
+      if (offset < s) {                                                      \
+        for (int r=0; r<6; r++)                                              \
+          red_acc[r][tid] += red_acc[r][tid+s];                              \
+      }                                                                      \
+    }                                                                        \
+    f.x=red_acc[0][tid];                                                     \
+    f.y=red_acc[1][tid];                                                     \
+    f.z=red_acc[2][tid];                                                     \
+    tor.x=red_acc[3][tid];                                                   \
+    tor.y=red_acc[4][tid];                                                   \
+    tor.z=red_acc[5][tid];                                                   \
+    if (eflag>0 || vflag>0) {                                                \
+      for (int r=0; r<6; r++)                                                \
+        red_acc[r][tid]=virial[r];                                           \
+      red_acc[6][tid]=energy;                                                \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                        \
+        if (offset < s) {                                                    \
+          for (int r=0; r<7; r++)                                            \
+            red_acc[r][tid] += red_acc[r][tid+s];                            \
+        }                                                                    \
+      }                                                                      \
+      for (int r=0; r<6; r++)                                                \
+        virial[r]=red_acc[r][tid];                                           \
+      energy=red_acc[6][tid];                                                \
+    }                                                                        \
+  }                                                                          \
+  if (offset==0) {                                                           \
+    __global acctyp *ap1=engv+ii;                                            \
+    if (eflag>0) {                                                           \
+      *ap1+=energy;                                                          \
+      ap1+=astride;                                                          \
+    }                                                                        \
+    if (vflag>0) {                                                           \
+      for (int i=0; i<6; i++) {                                              \
+        *ap1+=virial[i];                                                     \
+        ap1+=astride;                                                        \
+      }                                                                      \
+    }                                                                        \
+    acctyp4 old=ans[ii];                                                     \
+    old.x+=f.x;                                                              \
+    old.y+=f.y;                                                              \
+    old.z+=f.z;                                                              \
+    ans[ii]=old;                                                             \
+    old=ans[ii+astride];                                                     \
+    old.x+=tor.x;                                                            \
+    old.y+=tor.y;                                                            \
+    old.z+=tor.z;                                                            \
+    ans[ii+astride]=old;                                                     \
+  }
+
+#else
+
+#define store_answers_rt(f, tor, energy, virial, ii, astride, tid,          \
+                         t_per_atom, offset, eflag, vflag, ans, engv)       \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1+=energy;                                                         \
+      ap1+=astride;                                                         \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1+=virial[i];                                                    \
+        ap1+=astride;                                                       \
+      }                                                                     \
+    }                                                                       \
+    acctyp4 old=ans[ii];                                                    \
+    old.x+=f.x;                                                             \
+    old.y+=f.y;                                                             \
+    old.z+=f.z;                                                             \
+    ans[ii]=old;                                                            \
+    old=ans[ii+astride];                                                    \
+    old.x+=tor.x;                                                           \
+    old.y+=tor.y;                                                           \
+    old.z+=tor.z;                                                           \
+    ans[ii+astride]=old;                                                    \
+  }
+
+#endif
+
+__kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
+                                           const __global numtyp4 *restrict q,
+                                           const __global numtyp4 *restrict shape, 
+                                           const __global numtyp4 *restrict well,
+                                           const __global numtyp *restrict splj, 
+                                           const __global numtyp2 *restrict sig_eps,
+                                           const int ntypes, 
+                                           const __global int *dev_nbor,
+                                           const int stride, 
+                                           __global acctyp4 *restrict ans,
+                                           const int astride, 
+                                           __global acctyp *restrict engv, 
+                                           __global int *restrict err_flag, 
+                                           const int eflag, const int vflag, 
+                                           const int inum, 
+                                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -55,7 +172,7 @@ __kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
     int i, numj, n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,nbor_end,nbor);
@@ -260,86 +377,26 @@ __kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_,
       }
 
     } // for nbor
-
-    // Reduce answers
-    if (t_per_atom>1) {
-      __local acctyp red_acc[7][BLOCK_PAIR];
-      
-      red_acc[0][tid]=f.x;
-      red_acc[1][tid]=f.y;
-      red_acc[2][tid]=f.z;
-      red_acc[3][tid]=tor.x;
-      red_acc[4][tid]=tor.y;
-      red_acc[5][tid]=tor.z;
-  
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        if (offset < s) {
-          for (int r=0; r<6; r++)
-            red_acc[r][tid] += red_acc[r][tid+s];
-        }
-      }
-      
-      f.x=red_acc[0][tid];
-      f.y=red_acc[1][tid];
-      f.z=red_acc[2][tid];
-      tor.x=red_acc[3][tid];
-      tor.y=red_acc[4][tid];
-      tor.z=red_acc[5][tid];
-  
-      if (eflag>0 || vflag>0) {
-        for (int r=0; r<6; r++)
-          red_acc[r][tid]=virial[r];
-        red_acc[6][tid]=energy;
-  
-        for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-          if (offset < s) {
-            for (int r=0; r<7; r++)
-              red_acc[r][tid] += red_acc[r][tid+s];
-          }
-        }
-      
-        for (int r=0; r<6; r++)
-          virial[r]=red_acc[r][tid];
-        energy=red_acc[6][tid];
-      }
-    }
-  
-    // Store answers
-    if (offset==0) {
-      __global acctyp *ap1=engv+ii;
-      if (eflag>0) {
-        *ap1+=energy;
-        ap1+=astride;
-      }
-      if (vflag>0) {
-        for (int i=0; i<6; i++) {
-          *ap1+=virial[i];
-          ap1+=astride;
-        }
-      }
-      acctyp4 old=ans[ii];
-      old.x+=f.x;
-      old.y+=f.y;
-      old.z+=f.z;
-      ans[ii]=old;
-      
-      old=ans[ii+astride];
-      old.x+=tor.x;
-      old.y+=tor.y;
-      old.z+=tor.z;
-      ans[ii+astride]=old;
-    }
+    store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag,
+                     vflag,ans,engv);
   } // if ii
 }
 
-__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
-                  __global numtyp4 *q, __global numtyp4* shape,
-                  __global numtyp4* well, __global numtyp *splj,
-                  __global numtyp2* sig_eps, const int ntypes, 
-                  __global int *dev_nbor, const int stride, 
-                  __global acctyp4 *ans, __global acctyp *engv, 
-                  __global int *err_flag, const int eflag, const int vflag,
-                  const int start, const int inum, const int t_per_atom) {
+__kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
+                                           const __global numtyp4 *restrict q, 
+                                           const __global numtyp4 *restrict shape,
+                                           const __global numtyp4 *restrict well,
+                                           const __global numtyp *restrict splj,
+                                           const __global numtyp2 *restrict sig_eps,
+                                           const int ntypes, 
+                                           const __global int *dev_nbor,
+                                           const int stride, 
+                                           __global acctyp4 *restrict ans,
+                                           __global acctyp *restrict engv, 
+                                           __global int *restrict err_flag,
+                                           const int eflag, const int vflag,
+                                           const int start, const int inum, 
+                                           const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -366,7 +423,7 @@ __kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
     virial[i]=(acctyp)0;
 
   if (ii<inum) {
-    __global int *nbor, *nbor_end;
+    const __global int *nbor, *nbor_end;
     int j, numj, n_stride;
     nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,j,numj,
                 n_stride,nbor_end,nbor);
@@ -525,11 +582,16 @@ __kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_,
   } // if ii
 }
 
-__kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1, 
-                             __global numtyp4* lj3, const int lj_types, 
-                             __global numtyp *gum, const int stride, 
-                             __global int *dev_ij, __global acctyp4 *ans,
-                             __global acctyp *engv, __global int *err_flag,
+__kernel void k_resquared_lj(const __global numtyp4 *restrict x_, 
+                             const __global numtyp4 *restrict lj1, 
+                             const __global numtyp4 *restrict lj3, 
+                             const int lj_types, 
+                             const __global numtyp *restrict gum, 
+                             const int stride, 
+                             const __global int *dev_ij, 
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv, 
+                             __global int *restrict err_flag,
                              const int eflag, const int vflag, const int start,
                              const int inum, const int t_per_atom) {
   int tid, ii, offset;
@@ -552,7 +614,7 @@ __kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
@@ -606,12 +668,18 @@ __kernel void k_resquared_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
   } // if ii
 }
 
-__kernel void k_resquared_lj_fast(__global numtyp4 *x_, 
-                  __global numtyp4 *lj1_in, __global numtyp4* lj3_in, 
-                  __global numtyp *gum, const int stride, __global int *dev_ij,
-                  __global acctyp4 *ans, __global acctyp *engv, 
-                  __global int *err_flag, const int eflag, const int vflag,
-                  const int start, const int inum, const int t_per_atom) {
+__kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, 
+                                  const __global numtyp4 *restrict lj1_in, 
+                                  const __global numtyp4 *restrict lj3_in, 
+                                  const __global numtyp *restrict gum, 
+                                  const int stride,
+                                  const __global int *dev_ij,
+                                  __global acctyp4 *restrict ans,
+                                  __global acctyp *restrict engv, 
+                                  __global int *restrict err_flag,
+                                  const int eflag, const int vflag,
+                                  const int start, const int inum, 
+                                  const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   ii+=start;
@@ -639,7 +707,7 @@ __kernel void k_resquared_lj_fast(__global numtyp4 *x_,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
                 n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu
index 4730c52cd1..ed843b59ca 100644
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@@ -39,18 +39,21 @@ typedef union {
 
 /// ---------------- LOOKUP -------------------------------------------------
 
-__kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
-                       __global numtyp4* coeff2, 
-                       __global numtyp4 *coeff3,
-                       __global numtyp4 *coeff4,
-                       const int lj_types,
-                       __global numtyp *cutsq,
-                       __global numtyp* sp_lj_in, 
-                       __global int *dev_nbor, __global int *dev_packed, 
-                       __global acctyp4 *ans, __global acctyp *engv, 
-                       const int eflag, const int vflag, const int inum, 
-                       const int nbor_pitch, const int t_per_atom, 
-                       int tablength) {
+__kernel void k_table(const __global numtyp4 *restrict x_, 
+                      const __global int *restrict tabindex,
+                      const __global numtyp4 *restrict coeff2, 
+                      const __global numtyp4 *restrict coeff3,
+                      const __global numtyp4 *restrict coeff4,
+                      const int lj_types,
+                      const __global numtyp *restrict cutsq,
+                      const __global numtyp *restrict sp_lj_in, 
+                      const __global int *dev_nbor, 
+                      const __global int *dev_packed, 
+                      __global acctyp4 *restrict ans, 
+                      __global acctyp *restrict engv, 
+                      const int eflag, const int vflag, const int inum, 
+                      const int nbor_pitch, const int t_per_atom, 
+                      int tablength) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -70,7 +73,7 @@ __kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -130,17 +133,20 @@ __kernel void k_table(__global numtyp4 *x_, __global int *tabindex,
   } // if ii
 }
 
-__kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
-                            __global numtyp4* coeff2, 
-                            __global numtyp4 *coeff3,
-                            __global numtyp4 *coeff4,
-                            __global numtyp *cutsq_in,
-                            __global numtyp* sp_lj_in, 
-                            __global int *dev_nbor, __global int *dev_packed, 
-                            __global acctyp4 *ans, __global acctyp *engv, 
-                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, const int t_per_atom, 
-                            int tablength) {
+__kernel void k_table_fast(const __global numtyp4 *restrict x_,
+                           const __global int *restrict tabindex,
+                           const __global numtyp4 *restrict coeff2, 
+                           const __global numtyp4 *restrict coeff3,
+                           const __global numtyp4 *restrict coeff4,
+                           const __global numtyp *restrict cutsq_in,
+                           const __global numtyp *restrict sp_lj_in, 
+                           const __global int *dev_nbor, 
+                           const __global int *dev_packed, 
+                           __global acctyp4 *restrict ans, 
+                           __global acctyp *restrict engv, 
+                           const int eflag, const int vflag, const int inum, 
+                           const int nbor_pitch, const int t_per_atom, 
+                           int tablength) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -164,7 +170,7 @@ __kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -227,18 +233,21 @@ __kernel void k_table_fast(__global numtyp4 *x_, __global int *tabindex,
 
 /// ---------------- LINEAR -------------------------------------------------
 
-__kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
-                       __global numtyp4* coeff2, 
-                       __global numtyp4 *coeff3,
-                       __global numtyp4 *coeff4,
-                       const int lj_types,
-                       __global numtyp *cutsq,
-                       __global numtyp* sp_lj_in, 
-                       __global int *dev_nbor, __global int *dev_packed, 
-                       __global acctyp4 *ans, __global acctyp *engv, 
-                       const int eflag, const int vflag, const int inum, 
-                       const int nbor_pitch, const int t_per_atom, 
-                       int tablength) {
+__kernel void k_table_linear(const __global numtyp4 *restrict x_, 
+                             const __global int *restrict tabindex,
+                             const __global numtyp4 *restrict coeff2, 
+                             const __global numtyp4 *restrict coeff3,
+                             const __global numtyp4 *restrict coeff4,
+                             const int lj_types,
+                             const __global numtyp *restrict cutsq,
+                             const __global numtyp *restrict sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *restrict ans, 
+                             __global acctyp *restrict engv, 
+                             const int eflag, const int vflag, const int inum, 
+                             const int nbor_pitch, const int t_per_atom, 
+                             int tablength) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -258,7 +267,7 @@ __kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -322,17 +331,20 @@ __kernel void k_table_linear(__global numtyp4 *x_, __global int *tabindex,
   } // if ii
 }
 
-__kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
-                            __global numtyp4* coeff2, 
-                            __global numtyp4 *coeff3,
-                            __global numtyp4 *coeff4,
-                            __global numtyp *cutsq_in,
-                            __global numtyp* sp_lj_in, 
-                            __global int *dev_nbor, __global int *dev_packed, 
-                            __global acctyp4 *ans, __global acctyp *engv, 
-                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, const int t_per_atom, 
-                            int tablength) {
+__kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, 
+                                  const __global int *restrict tabindex,
+                                  const __global numtyp4 *restrict coeff2, 
+                                  const __global numtyp4 *restrict coeff3,
+                                  const __global numtyp4 *restrict coeff4,
+                                  const __global numtyp *restrict cutsq_in,
+                                  const __global numtyp *restrict sp_lj_in, 
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed, 
+                                  __global acctyp4 *restrict ans, 
+                                  __global acctyp *restrict engv, 
+                                  const int eflag, const int vflag, 
+                                  const int inum, const int nbor_pitch, 
+                                  const int t_per_atom, int tablength) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -356,7 +368,7 @@ __kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -423,18 +435,21 @@ __kernel void k_table_linear_fast(__global numtyp4 *x_, __global int *tabindex,
 
 /// ---------------- SPLINE -------------------------------------------------
 
-__kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
-                       __global numtyp4* coeff2, 
-                       __global numtyp4 *coeff3,
-                       __global numtyp4 *coeff4,
-                       const int lj_types,
-                       __global numtyp *cutsq,
-                       __global numtyp* sp_lj_in, 
-                       __global int *dev_nbor, __global int *dev_packed, 
-                       __global acctyp4 *ans, __global acctyp *engv, 
-                       const int eflag, const int vflag, const int inum, 
-                       const int nbor_pitch, const int t_per_atom, 
-                       int tablength) {
+__kernel void k_table_spline(const __global numtyp4 *restrict x_, 
+                             const __global int *restrict tabindex,
+                             const __global numtyp4 *restrict coeff2, 
+                             const __global numtyp4 *restrict coeff3,
+                             const __global numtyp4 *restrict coeff4,
+                             const int lj_types,
+                             const __global numtyp *restrict cutsq,
+                             const __global numtyp *restrict sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *restrict ans, 
+                             __global acctyp *restrict engv, 
+                             const int eflag, const int vflag, const int inum, 
+                             const int nbor_pitch, const int t_per_atom, 
+                             int tablength) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -454,7 +469,7 @@ __kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -525,17 +540,20 @@ __kernel void k_table_spline(__global numtyp4 *x_, __global int *tabindex,
   } // if ii
 }
 
-__kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
-                            __global numtyp4* coeff2, 
-                            __global numtyp4 *coeff3,
-                            __global numtyp4 *coeff4,
-                            __global numtyp *cutsq_in,
-                            __global numtyp* sp_lj_in, 
-                            __global int *dev_nbor, __global int *dev_packed, 
-                            __global acctyp4 *ans, __global acctyp *engv, 
-                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, const int t_per_atom, 
-                            int tablength) {
+__kernel void k_table_spline_fast(const __global numtyp4 *x_, 
+                                  const __global int *tabindex,
+                                  const __global numtyp4* coeff2, 
+                                  const __global numtyp4 *coeff3,
+                                  const __global numtyp4 *coeff4,
+                                  const __global numtyp *cutsq_in,
+                                  const __global numtyp* sp_lj_in, 
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed, 
+                                  __global acctyp4 *ans, 
+                                  __global acctyp *engv, 
+                                  const int eflag, const int vflag, 
+                                  const int inum, const int nbor_pitch, 
+                                  const int t_per_atom, int tablength) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -559,7 +577,7 @@ __kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -633,19 +651,23 @@ __kernel void k_table_spline_fast(__global numtyp4 *x_, __global int *tabindex,
 
 /// ---------------- BITMAP -------------------------------------------------
 
-__kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
-                       __global int *nshiftbits, __global int *nmask,
-                       __global numtyp4* coeff2, 
-                       __global numtyp4 *coeff3,
-                       __global numtyp4 *coeff4,
-                       const int lj_types,
-                       __global numtyp *cutsq,
-                       __global numtyp* sp_lj_in, 
-                       __global int *dev_nbor, __global int *dev_packed, 
-                       __global acctyp4 *ans, __global acctyp *engv, 
-                       const int eflag, const int vflag, const int inum, 
-                       const int nbor_pitch, const int t_per_atom, 
-                       int tablength) {
+__kernel void k_table_bitmap(const __global numtyp4 *x_, 
+                             const __global int *tabindex,
+                             const __global int *nshiftbits, 
+                             const __global int *nmask,
+                             const __global numtyp4* coeff2, 
+                             const __global numtyp4 *coeff3,
+                             const __global numtyp4 *coeff4,
+                             const int lj_types,
+                             const __global numtyp *cutsq,
+                             const __global numtyp* sp_lj_in, 
+                             const __global int *dev_nbor, 
+                             const __global int *dev_packed, 
+                             __global acctyp4 *ans, 
+                             __global acctyp *engv, 
+                             const int eflag, const int vflag, const int inum, 
+                             const int nbor_pitch, const int t_per_atom, 
+                             int tablength) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -665,7 +687,7 @@ __kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -732,18 +754,22 @@ __kernel void k_table_bitmap(__global numtyp4 *x_, __global int *tabindex,
   } // if ii
 }
 
-__kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
-                            __global int *nshiftbits, __global int *nmask,
-                            __global numtyp4* coeff2, 
-                            __global numtyp4 *coeff3,
-                            __global numtyp4 *coeff4,
-                            __global numtyp *cutsq_in,
-                            __global numtyp* sp_lj_in, 
-                            __global int *dev_nbor, __global int *dev_packed, 
-                            __global acctyp4 *ans, __global acctyp *engv, 
-                            const int eflag, const int vflag, const int inum, 
-                            const int nbor_pitch, const int t_per_atom, 
-                            int tablength) {
+__kernel void k_table_bitmap_fast(const __global numtyp4 *x_, 
+                                  const __global int *tabindex,
+                                  const __global int *nshiftbits, 
+                                  const __global int *nmask,
+                                  const __global numtyp4* coeff2, 
+                                  const __global numtyp4 *coeff3,
+                                  const __global numtyp4 *coeff4,
+                                  const __global numtyp *cutsq_in,
+                                  const __global numtyp* sp_lj_in, 
+                                  const __global int *dev_nbor, 
+                                  const __global int *dev_packed, 
+                                  __global acctyp4 *ans, 
+                                  __global acctyp *engv, 
+                                  const int eflag, const int vflag, 
+                                  const int inum, const int nbor_pitch, 
+                                  const int t_per_atom, int tablength) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -767,7 +793,7 @@ __kernel void k_table_bitmap_fast(__global numtyp4 *x_, __global int *tabindex,
   int tlm1 = tablength - 1;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu
index 593d123776..1e644b8a64 100644
--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@@ -24,13 +24,16 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
-                          const numtyp kappa, const int lj_types,
-                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                          __global int *dev_packed, __global acctyp4 *ans,
-                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum,
-                          const int nbor_pitch, const int t_per_atom) {
+__kernel void k_yukawa(const __global numtyp4 *restrict x_, 
+                       const __global numtyp4 *restrict coeff,
+                       const numtyp kappa, const int lj_types,
+                       const __global numtyp *restrict sp_lj_in, 
+                       const __global int *dev_nbor, 
+                       const __global int *dev_packed, 
+                       __global acctyp4 *restrict ans,
+                       __global acctyp *restrict engv, 
+                       const int eflag, const int vflag, const int inum,
+                       const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
 
@@ -48,7 +51,7 @@ __kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -105,12 +108,16 @@ __kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff,
   } // if ii
 }
 
-__kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
-                               const numtyp kappa, __global numtyp* sp_lj_in, 
-                               __global int *dev_nbor, __global int *dev_packed, 
-                               __global acctyp4 *ans, __global acctyp *engv, 
-                               const int eflag, const int vflag, const int inum, 
-                               const int nbor_pitch, const int t_per_atom) {
+__kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, 
+                            const __global numtyp4 *restrict coeff_in,
+                            const numtyp kappa, 
+                            const __global numtyp *restrict sp_lj_in, 
+                            const __global int *dev_nbor, 
+                            const __global int *dev_packed, 
+                            __global acctyp4 *restrict ans, 
+                            __global acctyp *restrict engv, 
+                            const int eflag, const int vflag, const int inum, 
+                            const int nbor_pitch, const int t_per_atom) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -132,7 +139,7 @@ __kernel void k_yukawa_fast(__global numtyp4 *x_, __global numtyp4 *coeff_in,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp
index ee0571fd3e..41978639e3 100644
--- a/lib/gpu/lal_yukawa_colloid.cpp
+++ b/lib/gpu/lal_yukawa_colloid.cpp
@@ -30,7 +30,7 @@ extern Device<PRECISION,ACC_PRECISION> device;
 
 template <class numtyp, class acctyp>
 YukawaColloidT::YukawaColloid() : BaseAtomic<numtyp,acctyp>(), 
-_allocated(false), _max_rad_size(0) {
+_max_rad_size(0), _allocated(false) {
 }
 
 template <class numtyp, class acctyp>
diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu
index 4eb9bc87b2..55f52e70e7 100644
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@@ -29,12 +29,16 @@ texture<int2> rad_tex;
 #define rad_tex rad_
 #endif
 
-__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
-                               __global numtyp4 *coeff, const int lj_types, 
-                               __global numtyp *sp_lj_in, __global int *dev_nbor, 
-                               __global int *dev_packed, __global acctyp4 *ans,
-                               __global acctyp *engv, const int eflag, 
-                               const int vflag, const int inum,
+__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, 
+                               const __global numtyp *restrict rad_,
+                               const __global numtyp4 *restrict coeff, 
+                               const int lj_types, 
+                               const __global numtyp *restrict sp_lj_in, 
+                               const __global int *dev_nbor, 
+                               const __global int *dev_packed, 
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv, 
+                               const int eflag, const int vflag, const int inum,
                                const int nbor_pitch, const int t_per_atom,
                                const numtyp kappa) {
   int tid, ii, offset;
@@ -54,7 +58,7 @@ __kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
     virial[i]=(acctyp)0;
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);
@@ -113,13 +117,17 @@ __kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_,
   } // if ii
 }
 
-__kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
-                                    __global numtyp4 *coeff_in, __global numtyp *sp_lj_in,
-                                    __global int *dev_nbor, __global int *dev_packed, 
-                                    __global acctyp4 *ans, __global acctyp *engv, 
-                                    const int eflag, const int vflag, const int inum, 
-                                    const int nbor_pitch, const int t_per_atom,
-                                    const numtyp kappa) {
+__kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, 
+                                    const __global numtyp *restrict rad_,
+                                    const __global numtyp4 *restrict coeff_in, 
+                                    const __global numtyp *restrict sp_lj_in,
+                                    const __global int *dev_nbor, 
+                                    const __global int *dev_packed, 
+                                    __global acctyp4 *restrict ans, 
+                                    __global acctyp *restrict engv, 
+                                    const int eflag, const int vflag, 
+                                    const int inum, const int nbor_pitch, 
+                                    const int t_per_atom, const numtyp kappa) {
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
   
@@ -141,7 +149,7 @@ __kernel void k_yukawa_colloid_fast(__global numtyp4 *x_, __global numtyp *rad_,
   __syncthreads();
   
   if (ii<inum) {
-    __global int *nbor, *list_end;
+    const __global int *nbor, *list_end;
     int i, numj, n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,list_end,nbor);