diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu
index 6076c55283..2b38bd02dc 100644
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@@ -308,7 +308,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
 
 }
 
-#define threebody(delr1x, delr1y, delr1z, eflag, energy)                     \
+#define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy)  \
 {                                                                            \
   numtyp r1 = ucl_sqrt(rsq1);                                                \
   numtyp rinvsq1 = ucl_recip(rsq1);                                          \
@@ -361,7 +361,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
   }                                                                          \
 }
 
-#define threebody_half(delr1x, delr1y, delr1z)                               \
+#define threebody_half(delr1x, delr1y, delr1z, delr2x, delr2y, delr2z)       \
 {                                                                            \
   numtyp r1 = ucl_sqrt(rsq1);                                                \
   numtyp rinvsq1 = ucl_recip(rsq1);                                          \
@@ -511,7 +511,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
           sw_costheta_ijk=sw3_ijkparam.z;
 
           numtyp fjx, fjy, fjz, fkx, fky, fkz;
-          threebody(delr1x,delr1y,delr1z,eflag,energy);
+          threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);
 
           f.x -= fjx + fkx;
           f.y -= fjy + fky;
@@ -665,12 +665,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
           sw_costheta_ijk=sw3_ijkparam.z;
 
           numtyp fjx, fjy, fjz;
-          //if (evatom==0) {
-            threebody_half(delr1x,delr1y,delr1z);
-          //} else {
-          //  numtyp fkx, fky, fkz;
-          //  threebody(delr1x,delr1y,delr1z,eflag,energy);
-          //}
+          threebody_half(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z);
 
           f.x += fjx;
           f.y += fjy;
@@ -819,7 +814,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
           sw_costheta_ijk=sw3_ijkparam.z;
 
           numtyp fjx, fjy, fjz, fkx, fky, fkz;
-          threebody(delr1x,delr1y,delr1z,eflag,energy);
+          threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);
 
           f.x += fjx;
           f.y += fjy;
diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp
index 182859bdd4..fa6ae67dfc 100644
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@@ -192,7 +192,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
 
   _allocated=true;
   this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+
-    ts4.row_bytes()+cutsq.row_bytes()+
+    ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+
     map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes();
   return 0;
 }
@@ -250,10 +250,9 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                                (BX/this->_threads_per_atom)));
 
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
-                 &elem2param, &_nelements, &_nparams,
+  this->k_short_nbor.run(&this->atom->x, &elem2param, &_nelements, &_nparams,
                  &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                 &this->dev_short_nbor, &ainum,
+                 &this->dev_short_nbor, &_cutshortsq, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
 
   // re-allocate zetaij if necessary
diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu
index c26c17969f..ff8625b99d 100644
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@@ -165,13 +165,12 @@ _texture( ts5_tex,int4);
 #endif
 
 __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global numtyp *restrict cutsq,
-                                   const __global int *restrict map,
                                    const __global int *restrict elem2param,
                                    const int nelements, const int nparams,
                                    const __global int * dev_nbor,
                                    const __global int * dev_packed,
                                    __global int * dev_short_nbor,
+                                   const double _cutshortsq,
                                    const int inum, const int nbor_pitch,
                                    const int t_per_atom) {
   __local int n_stride;
@@ -185,8 +184,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    int itype=ix.w;
-    itype=map[itype];
 
     int ncount = 0;
     int m = nbor;
@@ -200,9 +197,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int jtype=jx.w;
-      jtype=map[jtype];
-      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -210,7 +204,7 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutsq[ijparam]) {
+      if (rsq<_cutshortsq) {
         dev_short_nbor[nbor_short] = nj;
         nbor_short += n_stride;
         ncount++;
@@ -461,7 +455,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      // rsq<cutsq[ijparam]
+      if (rsq >= cutsq[ijparam]) continue;
+
       numtyp feng[2];
       numtyp ijparam_lam1 = ts1[ijparam].x;
       numtyp4 ts2_ijparam = ts2[ijparam];
@@ -578,6 +573,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
       delr1[1] = jx.y-ix.y;
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
+      if (rsq1 >= cutsq[ijparam]) continue;
 
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
@@ -725,7 +721,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
-  __local int red_acc[2*BLOCK_PAIR];
+  __local int red_acc[BLOCK_PAIR];
 
   __syncthreads();
 
@@ -759,7 +755,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
       jtype=map[jtype];
-      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delr1[3];
@@ -806,21 +801,14 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
-          red_acc[2*m+0] = ijnum;
-          red_acc[2*m+1] = offset_k;
+          red_acc[m] = ijnum;
           break;
         }
       }
 
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
-      int offset_kf;
-      if (ijnum >= 0) {
-        offset_kf = offset_k;
-      } else {
-        ijnum = red_acc[2*m+0];
-        offset_kf = red_acc[2*m+1];
-      }
+      if (ijnum < 0) ijnum = red_acc[m];
 
       // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
       int idx = ijnum;
@@ -863,7 +851,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
         delr2[2] = kx.z-jx.z;
         numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
 
-        if (rsq2 > cutsq[jikparam]) continue;
         numtyp r2 = ucl_sqrt(rsq2);
         numtyp r2inv = ucl_rsqrt(rsq2);
         numtyp4 ts1_param, ts2_param, ts4_param, ts5_param;
@@ -892,6 +879,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
         // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
         int idx = nbor_k;
         if (dev_packed==dev_nbor) idx -= n_stride;
+
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
         numtyp prefactor_jk = zeta_jk.y;
         int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -971,7 +959,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
-  __local int red_acc[2*BLOCK_PAIR];
+  __local int red_acc[BLOCK_PAIR];
 
   __syncthreads();
 
@@ -1005,7 +993,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
       jtype=map[jtype];
-      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delr1[3];
@@ -1052,21 +1039,14 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
-          red_acc[2*m+0] = ijnum;
-          red_acc[2*m+1] = offset_k;
+          red_acc[m] = ijnum;
           break;
         }
       }
 
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
-      int offset_kf;
-      if (ijnum >= 0) {
-        offset_kf = offset_k;
-      } else {
-        ijnum = red_acc[2*m+0];
-        offset_kf = red_acc[2*m+1];
-      }
+      if (ijnum < 0) ijnum = red_acc[m];
 
       // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
       int idx = ijnum;
diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp
index 92db59679e..052167cabd 100644
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@@ -275,10 +275,9 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
                                (BX/this->_threads_per_atom)));
 
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
-                 &elem2param, &_nelements, &_nparams,
+  this->k_short_nbor.run(&this->atom->x, &elem2param, &_nelements, &_nparams,
                  &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                 &this->dev_short_nbor, &ainum,
+                 &this->dev_short_nbor, &_cutshortsq, &ainum,
                  &nbor_pitch, &this->_threads_per_atom);
 
   // re-allocate zetaij if necessary
diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu
index b97f9247d0..1a75cb09e7 100644
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@@ -168,13 +168,12 @@ _texture( ts6_tex,int4);
 #endif
 
 __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global numtyp *restrict cutsq,
-                                   const __global int *restrict map,
                                    const __global int *restrict elem2param,
                                    const int nelements, const int nparams,
                                    const __global int * dev_nbor,
                                    const __global int * dev_packed,
                                    __global int * dev_short_nbor,
+                                   const double _cutshortsq,
                                    const int inum, const int nbor_pitch,
                                    const int t_per_atom) {
   __local int n_stride;
@@ -188,8 +187,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
               n_stride,nbor_end,nbor);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    int itype=ix.w;
-    itype=map[itype];
 
     int ncount = 0;
     int m = nbor;
@@ -203,9 +200,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
       j &= NEIGHMASK;
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int jtype=jx.w;
-      jtype=map[jtype];
-      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delx = ix.x-jx.x;
@@ -213,7 +207,7 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      if (rsq<cutsq[ijparam]) {
+      if (rsq<_cutshortsq) {
         dev_short_nbor[nbor_short] = nj;
         nbor_short += n_stride;
         ncount++;
@@ -474,7 +468,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
       numtyp delz = ix.z-jx.z;
       numtyp rsq = delx*delx+dely*dely+delz*delz;
 
-      // rsq<cutsq[ijparam]
+      if (rsq >= cutsq[ijparam]) continue;
+
       numtyp feng[2];
       numtyp ijparam_lam1 = ts1[ijparam].x;
       numtyp4 ts2_ijparam = ts2[ijparam];
@@ -594,6 +589,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
       delr1[1] = jx.y-ix.y;
       delr1[2] = jx.z-ix.z;
       numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
+      if (rsq1 >= cutsq[ijparam]) continue;
 
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
@@ -735,7 +731,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
-  __local int red_acc[2*BLOCK_PAIR];
+  __local int red_acc[BLOCK_PAIR];
 
   __syncthreads();
 
@@ -769,7 +765,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
       jtype=map[jtype];
-      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delr1[3];
@@ -816,21 +811,14 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
-          red_acc[2*m+0] = ijnum;
-          red_acc[2*m+1] = offset_k;
+          red_acc[m] = ijnum;
           break;
         }
       }
 
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
-      int offset_kf;
-      if (ijnum >= 0) {
-        offset_kf = offset_k;
-      } else {
-        ijnum = red_acc[2*m+0];
-        offset_kf = red_acc[2*m+1];
-      }
+      if (ijnum < 0) ijnum = red_acc[m];
 
       // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
       int idx = ijnum;
@@ -873,7 +861,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
         delr2[2] = kx.z-jx.z;
         numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
 
-        if (rsq2 > cutsq[jikparam]) continue;
         numtyp r2 = ucl_sqrt(rsq2);
         numtyp r2inv = ucl_rsqrt(rsq2);
         numtyp4 ts1_param, ts2_param, ts4_param;
@@ -899,6 +886,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
         // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
         int idx = nbor_k;
         if (dev_packed==dev_nbor) idx -= n_stride;
+
         acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
         numtyp prefactor_jk = zeta_jk.y;
         int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@@ -972,7 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
   for (int i=0; i<6; i++)
     virial[i]=(acctyp)0;
 
-  __local int red_acc[2*BLOCK_PAIR];
+  __local int red_acc[BLOCK_PAIR];
 
   __syncthreads();
 
@@ -1006,7 +994,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
       int jtype=jx.w;
       jtype=map[jtype];
-      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
 
       // Compute r12
       numtyp delr1[3];
@@ -1053,21 +1040,14 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
         k &= NEIGHMASK;
         if (k == i) {
           ijnum = nbor_k;
-          red_acc[2*m+0] = ijnum;
-          red_acc[2*m+1] = offset_k;
+          red_acc[m] = ijnum;
           break;
         }
       }
 
       numtyp r1 = ucl_sqrt(rsq1);
       numtyp r1inv = ucl_rsqrt(rsq1);
-      int offset_kf;
-      if (ijnum >= 0) {
-        offset_kf = offset_k;
-      } else {
-        ijnum = red_acc[2*m+0];
-        offset_kf = red_acc[2*m+1];
-      }
+      if (ijnum < 0) ijnum = red_acc[m];
 
       // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
       int idx = ijnum;