diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu
index efc6dbbd6a..47504f621e 100644
--- a/lib/gpu/cmm_cut_gpu_kernel.cu
+++ b/lib/gpu/cmm_cut_gpu_kernel.cu
@@ -75,6 +75,10 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
@@ -114,12 +118,9 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
@@ -227,12 +228,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu
index a43d73ffcf..4a19b5fe03 100644
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@@ -93,6 +93,10 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
@@ -139,14 +143,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
-      if (j < nall) {
-        factor_lj = (numtyp)1.0;
-        factor_coul = (numtyp)0.0;
-      } else {
-        factor_lj = sp_lj[j/nall];
-        factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
@@ -282,14 +282,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
-      if (j < nall) {
-        factor_lj = (numtyp)1.0;
-        factor_coul = (numtyp)0.0;
-      } else {
-        factor_lj = sp_lj[j/nall];
-        factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu
index 47dad50753..6ba6eaedca 100644
--- a/lib/gpu/crml_gpu_kernel.cu
+++ b/lib/gpu/crml_gpu_kernel.cu
@@ -93,6 +93,10 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
@@ -142,14 +146,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
-      if (j < nall) {
-        factor_lj = (numtyp)1.0;
-        factor_coul = (numtyp)0.0;
-      } else {
-        factor_lj = sp_lj[j/nall];
-        factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
@@ -287,14 +287,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
-      if (j < nall) {
-        factor_lj = (numtyp)1.0;
-        factor_coul = (numtyp)0.0;
-      } else {
-        factor_lj = sp_lj[j/nall];
-        factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
@@ -390,4 +386,3 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
 }
 
 #endif
-
diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu
index 347e6ede35..b8d06ec6da 100644
--- a/lib/gpu/gb_gpu_kernel.cu
+++ b/lib/gpu/gb_gpu_kernel.cu
@@ -22,6 +22,10 @@
 #include "gb_gpu_extra.h"
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape, 
                                  numtyp ans[9])
 {
@@ -142,12 +146,9 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
   for ( ; nbor<nbor_end; nbor+=stride) {
 
   int j=*nbor;
-  if (j < nall) 
-    factor_lj = (numtyp)1.0;
-  else {
-    factor_lj = sp_lj[j/nall];
-    j %= nall;
-  }
+  factor_lj = sp_lj[sbmask(j)];
+  j &= NEIGHMASK;
+
   numtyp4 jx=x_[j];
   int jtype=jx.w;
 
diff --git a/lib/gpu/gb_gpu_kernel_lj.cu b/lib/gpu/gb_gpu_kernel_lj.cu
index 72ce6684ab..3e42cbcbbc 100644
--- a/lib/gpu/gb_gpu_kernel_lj.cu
+++ b/lib/gpu/gb_gpu_kernel_lj.cu
@@ -22,6 +22,10 @@
 #include "gb_gpu_extra.h"
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
                                __global numtyp4* shape,__global numtyp4* well, 
                                __global numtyp *gum, __global numtyp2* sig_eps, 
@@ -68,12 +72,9 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
     for ( ; nbor<nbor_end; nbor+=stride) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=x_[j];
       int jtype=jx.w;
 
@@ -299,12 +300,9 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
     for ( ; nbor<list_end; nbor+=stride) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=x_[j];
       int jtype=jx.w;
 
@@ -409,12 +407,9 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for ( ; nbor<list_end; nbor+=stride) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=x_[j];
       int mtype=itype+jx.w;
 
diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu
index 083060a7a1..0d3a01fbac 100644
--- a/lib/gpu/lj96_cut_gpu_kernel.cu
+++ b/lib/gpu/lj96_cut_gpu_kernel.cu
@@ -75,6 +75,10 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
@@ -114,12 +118,9 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
@@ -220,12 +221,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu
index 25d00a202a..0e72e41f36 100644
--- a/lib/gpu/lj_cut_gpu_kernel.cu
+++ b/lib/gpu/lj_cut_gpu_kernel.cu
@@ -75,6 +75,10 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
@@ -114,12 +118,9 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
@@ -219,12 +220,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
   
       int j=*nbor;
-      if (j < nall) 
-        factor_lj = (numtyp)1.0;
-      else {
-        factor_lj = sp_lj[j/nall];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu
index fac49b87ea..2751e20702 100644
--- a/lib/gpu/ljc_cut_gpu_kernel.cu
+++ b/lib/gpu/ljc_cut_gpu_kernel.cu
@@ -85,6 +85,10 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
@@ -131,14 +135,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
-      if (j < nall) {
-        factor_lj = (numtyp)1.0;
-        factor_coul = (numtyp)1.0;
-      } else {
-        factor_lj = sp_lj[j/nall];
-        factor_coul = sp_lj[j/nall+4];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
@@ -260,14 +260,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
-      if (j < nall) {
-        factor_lj = (numtyp)1.0;
-        factor_coul = (numtyp)1.0;
-      } else {
-        factor_lj = sp_lj[j/nall];
-        factor_coul = sp_lj[j/nall+4];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu
index be2ae069e4..a0b27f0259 100644
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@@ -93,6 +93,10 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp4* lj3, const int lj_types, 
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
@@ -139,14 +143,10 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
-      if (j < nall) {
-        factor_lj = (numtyp)1.0;
-        factor_coul = (numtyp)0.0;
-      } else {
-        factor_lj = sp_lj[j/nall];
-        factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int jtype=jx.w;
 
@@ -274,14 +274,10 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
       int j=*nbor;
 
       numtyp factor_lj, factor_coul;
-      if (j < nall) {
-        factor_lj = (numtyp)1.0;
-        factor_coul = (numtyp)0.0;
-      } else {
-        factor_lj = sp_lj[j/nall];
-        factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
-        j %= nall;
-      }
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
       numtyp4 jx=fetch_pos(j,x_); //x_[j];
       int mtype=itype+jx.w;
 
diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu
index a08a26800a..bcf41c0050 100644
--- a/lib/gpu/pair_gpu_build_kernel.cu
+++ b/lib/gpu/pair_gpu_build_kernel.cu
@@ -57,6 +57,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define CELL_BLOCK_SIZE 64
 #define BLOCK_2D 8
 
+#define SBBITS 30
+
 __kernel void transpose(int *out, int *in, int columns_in, int rows_in)
 {
 	__local float block[BLOCK_2D][BLOCK_2D+1];
@@ -279,16 +281,16 @@ __kernel void kernel_special(__global int *dev_nbor,
       int offset=ii;
       for (int i=0; i<n3; i++) {
         if (special[offset]==jtag) {
-          nbor+=nall;
+          int which = 1;
           if (i>=n1)
-            nbor+=nall;
+            which++;
           if (i>=n2)
-            nbor+=nall;
+            which++;
+          nbor=nbor ^ (which << SBBITS);
+          *list=nbor;
         }
         offset+=nt;
       }
-      if (nbor>=nall)
-        *list=nbor;
     }
   } // if ii
 }