From 0425dd037e7b03a2d61b9ba45321ba2a0e882d4b Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Tue, 24 May 2011 20:15:04 -0400
Subject: [PATCH] Removing nall from kernel calls. Changing ellipsoid neighbor
 calls to use a correct block size.

---
 lib/gpu/base_ellipsoid.cpp       |  5 ++---
 lib/gpu/cmm_cut_gpu_kernel.cu    |  4 ++--
 lib/gpu/cmm_cut_gpu_memory.cpp   |  6 ++----
 lib/gpu/cmmc_long_gpu_kernel.cu  |  9 ++++-----
 lib/gpu/cmmc_long_gpu_memory.cpp |  5 ++---
 lib/gpu/cmmc_msm_gpu_kernel.cu   |  9 ++++-----
 lib/gpu/cmmc_msm_gpu_memory.cpp  |  8 +++-----
 lib/gpu/crml_gpu_kernel.cu       | 13 ++++++-------
 lib/gpu/crml_gpu_memory.cpp      | 10 ++++------
 lib/gpu/ellipsoid_nbor.cu        | 14 ++++++++------
 lib/gpu/gayberne.cpp             | 27 +++++++++++++++------------
 lib/gpu/gayberne.cu              |  2 +-
 lib/gpu/gayberne_lj.cu           |  6 +++---
 lib/gpu/lj96_cut_gpu_kernel.cu   |  5 ++---
 lib/gpu/lj96_cut_gpu_memory.cpp  |  6 ++----
 lib/gpu/lj_class2_long.cpp       |  9 +++------
 lib/gpu/lj_class2_long.cu        |  9 ++++-----
 lib/gpu/lj_cut_gpu_kernel.cu     |  5 ++---
 lib/gpu/lj_cut_gpu_memory.cpp    |  6 ++----
 lib/gpu/lj_expand_gpu_kernel.cu  |  5 ++---
 lib/gpu/lj_expand_gpu_memory.cpp |  6 ++----
 lib/gpu/ljc_cut_gpu_kernel.cu    |  8 ++++----
 lib/gpu/ljc_cut_gpu_memory.cpp   |  5 ++---
 lib/gpu/ljcl_cut_gpu_kernel.cu   |  9 ++++-----
 lib/gpu/ljcl_cut_gpu_memory.cpp  | 12 +++++-------
 lib/gpu/morse_gpu_kernel.cu      |  5 ++---
 lib/gpu/morse_gpu_memory.cpp     |  6 ++----
 lib/gpu/pair_gpu_build_kernel.cu |  2 +-
 lib/gpu/pair_gpu_nbor.cpp        |  2 +-
 lib/gpu/re_squared.cpp           | 31 +++++++++++++++++--------------
 lib/gpu/re_squared.cu            |  2 +-
 lib/gpu/re_squared_lj.cu         |  9 ++++-----
 32 files changed, 118 insertions(+), 142 deletions(-)

diff --git a/lib/gpu/base_ellipsoid.cpp b/lib/gpu/base_ellipsoid.cpp
index 55a4f26d02..45477662f6 100644
--- a/lib/gpu/base_ellipsoid.cpp
+++ b/lib/gpu/base_ellipsoid.cpp
@@ -236,17 +236,16 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
                                 const int form_high, const bool shared_types,
                                 int ntypes) {
   int stride=nbor->nbor_pitch();
-  int anall=atom->nall();
   if (shared_types) {
     k_nbor_fast.set_size(GX,BX);
     k_nbor_fast.run(&atom->dev_x.begin(), &cut_form.begin(), 
                     &nbor->dev_nbor.begin(), &stride, &start, &inum,
-                    &nbor->dev_packed.begin(), &form_low, &form_high, &anall);
+                    &nbor->dev_packed.begin(), &form_low, &form_high);
   } else {
     k_nbor.set_size(GX,BX);
     k_nbor.run(&atom->dev_x.begin(), &cut_form.begin(), &ntypes,
                &nbor->dev_nbor.begin(), &stride, &start, &inum, 
-               &nbor->dev_packed.begin(), &form_low, &form_high, &anall);
+               &nbor->dev_packed.begin(), &form_low, &form_high);
   }
 }
 
diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu
index f99e7f06ac..a5521f516b 100644
--- a/lib/gpu/cmm_cut_gpu_kernel.cu
+++ b/lib/gpu/cmm_cut_gpu_kernel.cu
@@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@@ -241,7 +241,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global numtyp* sp_lj_in,__global int *dev_nbor,
                                __global int *dev_packed, __global acctyp4 *ans,
                                __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum, const int nall,
+                               const int vflag, const int inum,
                                const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp
index 8a5949c9e7..8d18c99d7c 100644
--- a/lib/gpu/cmm_cut_gpu_memory.cpp
+++ b/lib/gpu/cmm_cut_gpu_memory.cpp
@@ -130,7 +130,6 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -141,15 +140,14 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu
index a47a9267a1..39ae01cb78 100644
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@@ -102,7 +102,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, __global numtyp *q_ ,
                           const numtyp cut_coulsq, const numtyp qqrd2e,
                           const numtyp g_ewald, const int t_per_atom) {
@@ -295,10 +295,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald,
-                               const int t_per_atom) {
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp
index e2f99fceca..c6291982d6 100644
--- a/lib/gpu/cmmc_long_gpu_memory.cpp
+++ b/lib/gpu/cmmc_long_gpu_memory.cpp
@@ -141,7 +141,6 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -152,7 +151,7 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
+                          &ainum, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   } else {
@@ -161,7 +160,7 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
+                     &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, 
                      &this->_threads_per_atom);
   }
diff --git a/lib/gpu/cmmc_msm_gpu_kernel.cu b/lib/gpu/cmmc_msm_gpu_kernel.cu
index 3ee88d2a3d..09fad801eb 100644
--- a/lib/gpu/cmmc_msm_gpu_kernel.cu
+++ b/lib/gpu/cmmc_msm_gpu_kernel.cu
@@ -94,7 +94,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, __global numtyp *q_,
                           const numtyp cut_coulsq, const numtyp qqrd2e,
                           const int smooth, const int t_per_atom) {
@@ -314,10 +314,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const int smooth,
-                               const int t_per_atom) {
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const int smooth, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/cmmc_msm_gpu_memory.cpp b/lib/gpu/cmmc_msm_gpu_memory.cpp
index ca051d4803..22d69a33e2 100644
--- a/lib/gpu/cmmc_msm_gpu_memory.cpp
+++ b/lib/gpu/cmmc_msm_gpu_memory.cpp
@@ -141,7 +141,6 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -152,7 +151,7 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
+                          &ainum, &nbor_pitch,
                           &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_smooth, &this->_threads_per_atom);
   } else {
@@ -161,9 +160,8 @@ void CMMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_smooth, 
-                     &this->_threads_per_atom);
+                     &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
+                     &_qqrd2e, &_smooth, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu
index dfdc7af3cd..f2ba74c8b6 100644
--- a/lib/gpu/crml_gpu_kernel.cu
+++ b/lib/gpu/crml_gpu_kernel.cu
@@ -103,12 +103,11 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global int *dev_nbor, __global int *dev_packed,
                           __global acctyp4 *ans, __global acctyp *engv, 
                           const int eflag, const int vflag, const int inum, 
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_, const numtyp cut_coulsq,
-                          const numtyp qqrd2e, const numtyp g_ewald,
-                          const numtyp denom_lj, const numtyp cut_bothsq, 
-                          const numtyp cut_ljsq, const numtyp cut_lj_innersq,
-                          const int t_per_atom) {
+                          const int nbor_pitch, __global numtyp *q_,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const numtyp denom_lj,
+                          const numtyp cut_bothsq, const numtyp cut_ljsq,
+                          const numtyp cut_lj_innersq, const int t_per_atom) {
 
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@@ -300,7 +299,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
                                __global numtyp* sp_lj_in, __global int *dev_nbor, 
                                __global int *dev_packed, __global acctyp4 *ans,
                                __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum, const int nall,
+                               const int vflag, const int inum,
                                const int nbor_pitch, __global numtyp *q_,
                                const numtyp cut_coulsq, const numtyp qqrd2e,
                                const numtyp g_ewald, const numtyp denom_lj,
diff --git a/lib/gpu/crml_gpu_memory.cpp b/lib/gpu/crml_gpu_memory.cpp
index 6661f67585..9c41cced1a 100644
--- a/lib/gpu/crml_gpu_memory.cpp
+++ b/lib/gpu/crml_gpu_memory.cpp
@@ -145,7 +145,6 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -155,10 +154,9 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
-                          &_cut_ljsq, &_cut_lj_innersq, 
+                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
+                          &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, 
                           &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
@@ -166,7 +164,7 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
+                     &nbor_pitch, &this->atom->dev_q.begin(),
                      &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
                      &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
                      &this->_threads_per_atom);
diff --git a/lib/gpu/ellipsoid_nbor.cu b/lib/gpu/ellipsoid_nbor.cu
index 67f9e631b4..3a83b65666 100644
--- a/lib/gpu/ellipsoid_nbor.cu
+++ b/lib/gpu/ellipsoid_nbor.cu
@@ -42,6 +42,10 @@
 #define numtyp4 float4
 #endif
 
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+__inline int sbmask(int j) { return j >> SBBITS & 3; }
+
 // ---------------------------------------------------------------------------
 // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
 // -- Only unpack neighbors matching the specified inclusive range of forms
@@ -51,7 +55,7 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
                           const int ntypes, __global int *dev_nbor,
                           const int nbor_pitch, const int start, const int inum, 
                           __global int *dev_ij, const int form_low, 
-                          const int form_high, const int nall) {
+                          const int form_high) {
                                 
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X+start;
@@ -71,8 +75,7 @@ __kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
     int newj=0;  
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
       int j=*nbor;
-      if (j>=nall)
-        j%=nall;
+      j &= NEIGHMASK;
       numtyp4 jx=x_[j];
       int jtype=jx.w;
       int mtype=itype+jtype;
@@ -107,7 +110,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
                                __global int *dev_nbor, const int nbor_pitch, 
                                const int start, const int inum, 
                                __global int *dev_ij, const int form_low, 
-                               const int form_high, const int nall) {
+                               const int form_high) {
                                 
   int ii=THREAD_ID_X;
   __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -135,8 +138,7 @@ __kernel void kernel_nbor_fast(__global numtyp4 *x_, __global numtyp2 *cut_form,
     int newj=0;  
     for ( ; nbor<list_end; nbor+=nbor_pitch) {
       int j=*nbor;
-      if (j>=nall)
-        j%=nall;
+      j &= NEIGHMASK;
       numtyp4 jx=x_[j];
       int jtype=jx.w;
       int mtype=itype+jtype;
diff --git a/lib/gpu/gayberne.cpp b/lib/gpu/gayberne.cpp
index d6299d9f94..643e86cd31 100644
--- a/lib/gpu/gayberne.cpp
+++ b/lib/gpu/gayberne.cpp
@@ -191,11 +191,9 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
+  int GX, NGX;
   int stride=this->nbor->nbor_pitch();
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
 
   if (this->_multiple_forms) {
     this->time_nbor1.start();
@@ -203,7 +201,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
       // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
       GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/
                                (BX/this->_threads_per_atom)));
-      this->pack_nbors(GX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
+      NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
+      this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
 			                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
       this->time_nbor1.stop();
 
@@ -214,7 +213,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
        &this->gamma_upsilon_mu.begin(), &this->sigma_epsilon.begin(), 
        &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
        &stride, &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
-       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, &anall,
+       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
        &this->_threads_per_atom);
       this->time_ellipsoid.stop();
 
@@ -234,7 +233,9 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
       GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
                                this->_last_ellipse)/
                                (BX/this->_threads_per_atom)));
-      this->pack_nbors(GX,BX,this->_last_ellipse,this->ans->inum(),
+      NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
+                                this->_last_ellipse)/BX));
+      this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
 			                 SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
       this->time_nbor2.stop();
 
@@ -246,7 +247,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
         &this->sigma_epsilon.begin(), &this->_lj_types, &this->lshape.begin(), 
         &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
         &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
-        &vflag, &this->_last_ellipse, &ainum, &anall, &this->_threads_per_atom);
+        &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
       this->time_ellipsoid2.stop();
    } else {
       this->ans->dev_ans.zero();
@@ -269,7 +270,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
           &this->lj3.begin(), &this->gamma_upsilon_mu.begin(), &stride,
           &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
           &this->ans->dev_engv.begin(), &this->dev_error.begin(),
-          &eflag, &vflag, &this->_last_ellipse, &ainum, &anall,
+          &eflag, &vflag, &this->_last_ellipse, &ainum,
           &this->_threads_per_atom);
       } else {
         this->k_lj.set_size(GX,BX);
@@ -277,14 +278,16 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
           &this->lj3.begin(), &this->_lj_types, &this->gamma_upsilon_mu.begin(),
           &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
           &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
-          &vflag, &this->_last_ellipse, &ainum, &anall, 
-          &this->_threads_per_atom);
+          &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
       }
     }
     this->time_lj.stop();
   } else {
+    GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                             (BX/this->_threads_per_atom)));
+    NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
     this->time_nbor1.start();
-    this->pack_nbors(GX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
+    this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
 		                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
     this->time_nbor1.stop();
     this->time_ellipsoid.start(); 
@@ -295,7 +298,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
       &this->_lj_types, &this->lshape.begin(), &this->nbor->dev_nbor.begin(),
       &stride, &this->ans->dev_ans.begin(), &ainum, 
       &this->ans->dev_engv.begin(), &this->dev_error.begin(),
-      &eflag, &vflag, &ainum, &anall, &this->_threads_per_atom);
+      &eflag, &vflag, &ainum, &this->_threads_per_atom);
     this->time_ellipsoid.stop();
   }
 }
diff --git a/lib/gpu/gayberne.cu b/lib/gpu/gayberne.cu
index 70f5ca8672..8964dd284c 100644
--- a/lib/gpu/gayberne.cu
+++ b/lib/gpu/gayberne.cu
@@ -95,7 +95,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
                                __global acctyp4 *ans, const int astride, 
                                __global acctyp *engv, __global int *err_flag, 
                                const int eflag, const int vflag, const int inum,
-                               const int nall, const int t_per_atom) {
+                               const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/gayberne_lj.cu b/lib/gpu/gayberne_lj.cu
index 0f014cac4c..4bd3a1f82a 100644
--- a/lib/gpu/gayberne_lj.cu
+++ b/lib/gpu/gayberne_lj.cu
@@ -32,7 +32,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                __global int *err_flag, const int eflag, 
                                const int vflag,const int start, const int inum, 
-                               const int nall, const int t_per_atom) {
+                               const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom+start;
@@ -306,7 +306,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
                         __global acctyp4 *ans, __global acctyp *engv, 
                         __global int *err_flag, const int eflag, 
                         const int vflag, const int start, const int inum, 
-                        const int nall, const int t_per_atom) {
+                        const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom+start;
@@ -448,7 +448,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                              __global acctyp4 *ans, __global acctyp *engv,
                              __global int *err_flag, const int eflag,
                              const int vflag, const int start, const int inum,
-                             const int nall, const int t_per_atom) {
+                             const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom+start;
diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu
index 1de9a8a7bf..6442089193 100644
--- a/lib/gpu/lj96_cut_gpu_kernel.cu
+++ b/lib/gpu/lj96_cut_gpu_kernel.cu
@@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@@ -234,8 +234,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               const int t_per_atom) {
+                               const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp
index 0b066c0973..4cc61a5d03 100644
--- a/lib/gpu/lj96_cut_gpu_memory.cpp
+++ b/lib/gpu/lj96_cut_gpu_memory.cpp
@@ -130,7 +130,6 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -141,15 +140,14 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lj_class2_long.cpp b/lib/gpu/lj_class2_long.cpp
index f70945690d..b7c0210e9f 100644
--- a/lib/gpu/lj_class2_long.cpp
+++ b/lib/gpu/lj_class2_long.cpp
@@ -139,7 +139,6 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -150,8 +149,7 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &_cut_coulsq,
+                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
                           &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
@@ -159,9 +157,8 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) {
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald, 
-                     &this->_threads_per_atom);
+                     &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lj_class2_long.cu b/lib/gpu/lj_class2_long.cu
index 2b113c23af..8225ec1a9e 100644
--- a/lib/gpu/lj_class2_long.cu
+++ b/lib/gpu/lj_class2_long.cu
@@ -100,7 +100,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, __global numtyp *q_,
                           const numtyp cut_coulsq, const numtyp qqrd2e,
                           const numtyp g_ewald, const int t_per_atom) {
@@ -284,10 +284,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald,
-                               const int t_per_atom) {
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu
index 9ef698cd09..42073f983d 100644
--- a/lib/gpu/lj_cut_gpu_kernel.cu
+++ b/lib/gpu/lj_cut_gpu_kernel.cu
@@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@@ -233,8 +233,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global int *dev_nbor, __global int *dev_packed, 
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               const int t_per_atom) {
+                               const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp
index a294eb647f..40622c576c 100644
--- a/lib/gpu/lj_cut_gpu_memory.cpp
+++ b/lib/gpu/lj_cut_gpu_memory.cpp
@@ -130,7 +130,6 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -141,15 +140,14 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/lj_expand_gpu_kernel.cu b/lib/gpu/lj_expand_gpu_kernel.cu
index 26fbefacf8..1a63ef803c 100644
--- a/lib/gpu/lj_expand_gpu_kernel.cu
+++ b/lib/gpu/lj_expand_gpu_kernel.cu
@@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@@ -236,8 +236,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               const int t_per_atom) {
+                               const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/lj_expand_gpu_memory.cpp b/lib/gpu/lj_expand_gpu_memory.cpp
index fe5bf0b513..714aa7c77f 100644
--- a/lib/gpu/lj_expand_gpu_memory.cpp
+++ b/lib/gpu/lj_expand_gpu_memory.cpp
@@ -130,7 +130,6 @@ void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -141,15 +140,14 @@ void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu
index ad1e530712..ca37e1235f 100644
--- a/lib/gpu/ljc_cut_gpu_kernel.cu
+++ b/lib/gpu/ljc_cut_gpu_kernel.cu
@@ -94,7 +94,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, __global numtyp *q_ ,
                           __global numtyp *cutsq, const numtyp qqrd2e,
                           const int t_per_atom) {
@@ -270,9 +270,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_ , __global numtyp *_cutsq,
-                               const numtyp qqrd2e, const int t_per_atom) {
+                               const int nbor_pitch, __global numtyp *q_,
+                               __global numtyp *_cutsq, const numtyp qqrd2e,
+                               const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp
index 642ff6ecc7..df089a626a 100644
--- a/lib/gpu/ljc_cut_gpu_memory.cpp
+++ b/lib/gpu/ljc_cut_gpu_memory.cpp
@@ -142,7 +142,6 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -153,7 +152,7 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
+                          &ainum, &nbor_pitch,
                           &this->atom->dev_q.begin(), &cutsq.begin(),
                           &_qqrd2e, &this->_threads_per_atom);
   } else {
@@ -162,7 +161,7 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
+                     &nbor_pitch, &this->atom->dev_q.begin(),
                      &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
   }
   this->time_pair.stop();
diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu
index ddde1dec32..e177f1a0fd 100644
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@@ -102,7 +102,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, __global numtyp *q_,
                           const numtyp cut_coulsq, const numtyp qqrd2e,
                           const numtyp g_ewald, const int t_per_atom) {
@@ -286,10 +286,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald,
-                               const int t_per_atom) {
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp
index f37e6b1857..f36d2a68c2 100644
--- a/lib/gpu/ljcl_cut_gpu_memory.cpp
+++ b/lib/gpu/ljcl_cut_gpu_memory.cpp
@@ -140,7 +140,6 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -151,18 +150,17 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                          &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                      &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald, 
-                     &this->_threads_per_atom);
+                     &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/morse_gpu_kernel.cu b/lib/gpu/morse_gpu_kernel.cu
index 8832f58c64..d4cf1c325c 100644
--- a/lib/gpu/morse_gpu_kernel.cu
+++ b/lib/gpu/morse_gpu_kernel.cu
@@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
                           __global numtyp *sp_lj_in, __global int *dev_nbor, 
                           __global int *dev_packed, __global acctyp4 *ans,
                           __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                           const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@@ -234,8 +234,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
                                __global int *dev_nbor, __global int *dev_packed,
                                __global acctyp4 *ans, __global acctyp *engv, 
                                const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               const int t_per_atom) {
+                               const int nbor_pitch, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/morse_gpu_memory.cpp b/lib/gpu/morse_gpu_memory.cpp
index f146b39215..d188683915 100644
--- a/lib/gpu/morse_gpu_memory.cpp
+++ b/lib/gpu/morse_gpu_memory.cpp
@@ -129,7 +129,6 @@ void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                                (BX/this->_threads_per_atom)));
 
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
@@ -140,15 +139,14 @@ void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                           &this->_nbor_data->begin(),
                           &this->ans->dev_ans.begin(),
                           &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
                      &_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                      &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                      &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
 }
diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu
index 33742a4cba..d1ed9fd743 100644
--- a/lib/gpu/pair_gpu_build_kernel.cu
+++ b/lib/gpu/pair_gpu_build_kernel.cu
@@ -253,7 +253,7 @@ __kernel void kernel_special(__global int *dev_nbor,
                              __global int *host_nbor_list, 
                              __global int *host_numj, __global int *tag,
                              __global int *nspecial, __global int *special,
-                             int inum, int nt, int nall, int max_nbors) {
+                             int inum, int nt, int max_nbors) {
   // ii indexes the two interacting particles in gi
   int ii=GLOBAL_ID_X;
 
diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp
index df138a7eff..499f3c132c 100644
--- a/lib/gpu/pair_gpu_nbor.cpp
+++ b/lib/gpu/pair_gpu_nbor.cpp
@@ -389,7 +389,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
     _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), 
                            &dev_host_numj.begin(), &atom.dev_tag.begin(), 
                            &dev_nspecial.begin(), &dev_special.begin(), 
-                           &inum, &nt, &nall, &_max_nbors);
+                           &inum, &nt, &_max_nbors);
   }
   time_kernel.stop();
 
diff --git a/lib/gpu/re_squared.cpp b/lib/gpu/re_squared.cpp
index 19dd077fa7..b27b6944ec 100644
--- a/lib/gpu/re_squared.cpp
+++ b/lib/gpu/re_squared.cpp
@@ -179,11 +179,9 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
   else
     vflag=0;
   
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
+  int GX, NGX;
   int stride=this->nbor->nbor_pitch();
   int ainum=this->ans->inum();
-  int anall=this->atom->nall();
 
   if (this->_multiple_forms) {
     if (this->_last_ellipse>0) {
@@ -191,7 +189,8 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
       this->time_nbor1.start();
       GX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/
                                (BX/this->_threads_per_atom)));
-      this->pack_nbors(GX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE,
+      NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
+      this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE,
 			                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
       this->time_nbor1.stop();
 
@@ -202,13 +201,13 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
        &this->special_lj.begin(), &this->sigma_epsilon.begin(), 
        &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
        &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
-       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, &anall,
+       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
        &this->_threads_per_atom);
       this->time_ellipsoid.stop();
 
       // ------------ ELLIPSE_SPHERE ---------------
       this->time_nbor2.start();
-      this->pack_nbors(GX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
+      this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
 			                 ELLIPSE_SPHERE,_shared_types,_lj_types);
       this->time_nbor2.stop();
 
@@ -219,7 +218,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
        &this->special_lj.begin(), &this->sigma_epsilon.begin(), 
        &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
        &this->ans->dev_ans.begin(),&ainum,&this->ans->dev_engv.begin(),
-       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse, &anall,
+       &this->dev_error.begin(), &eflag, &vflag, &this->_last_ellipse,
        &this->_threads_per_atom);
       this->time_ellipsoid2.stop();
 
@@ -236,7 +235,9 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
       GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
                                this->_last_ellipse)/
                                (BX/this->_threads_per_atom)));
-      this->pack_nbors(GX,BX,this->_last_ellipse,this->ans->inum(),
+      NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
+                               this->_last_ellipse)/BX));
+      this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
 			                 SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
       this->time_nbor3.stop();
 
@@ -248,7 +249,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
         &this->sigma_epsilon.begin(), &this->_lj_types,
         &this->nbor->dev_nbor.begin(), &stride, &this->ans->dev_ans.begin(),
         &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
-        &vflag, &this->_last_ellipse, &ainum, &anall, &this->_threads_per_atom);
+        &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
       this->time_ellipsoid3.stop();
    } else {
       this->ans->dev_ans.zero();
@@ -270,7 +271,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
           &this->lj3.begin(), &this->special_lj.begin(), &stride,
           &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
           &this->ans->dev_engv.begin(), &this->dev_error.begin(),
-          &eflag, &vflag, &this->_last_ellipse, &ainum, &anall,
+          &eflag, &vflag, &this->_last_ellipse, &ainum,
           &this->_threads_per_atom);
       } else {
         this->k_lj.set_size(GX,BX);
@@ -278,14 +279,16 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
           &this->lj3.begin(), &this->_lj_types, &this->special_lj.begin(),
           &stride, &this->nbor->dev_packed.begin(), &this->ans->dev_ans.begin(),
           &this->ans->dev_engv.begin(), &this->dev_error.begin(), &eflag,
-          &vflag, &this->_last_ellipse, &ainum, &anall, 
-          &this->_threads_per_atom);
+          &vflag, &this->_last_ellipse, &ainum, &this->_threads_per_atom);
       }
     }
     this->time_lj.stop();
   } else {
+    GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                             (BX/this->_threads_per_atom)));
+    NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
     this->time_nbor1.start();
-    this->pack_nbors(GX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
+    this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
 		                 ELLIPSE_ELLIPSE,_shared_types,_lj_types);
     this->time_nbor1.stop();
     this->time_ellipsoid.start(); 
@@ -295,7 +298,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
       &this->special_lj.begin(), &this->sigma_epsilon.begin(), 
       &this->_lj_types, &this->nbor->dev_nbor.begin(), &stride,
       &this->ans->dev_ans.begin(), &ainum,  &this->ans->dev_engv.begin(),
-      &this->dev_error.begin(), &eflag, &vflag, &ainum, &anall,
+      &this->dev_error.begin(), &eflag, &vflag, &ainum, 
       &this->_threads_per_atom);
     this->time_ellipsoid.stop();
   }
diff --git a/lib/gpu/re_squared.cu b/lib/gpu/re_squared.cu
index a9484a9141..d91a04f5a2 100644
--- a/lib/gpu/re_squared.cu
+++ b/lib/gpu/re_squared.cu
@@ -47,7 +47,7 @@ __kernel void kernel_ellipsoid(__global numtyp4* x_,__global numtyp4 *q,
                                const int astride, __global acctyp *engv,
                                __global int *err_flag, const int eflag,
                                const int vflag, const int inum,
-                               const int nall, const int t_per_atom) {
+                               const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
diff --git a/lib/gpu/re_squared_lj.cu b/lib/gpu/re_squared_lj.cu
index 97045aa723..784dbe63e7 100644
--- a/lib/gpu/re_squared_lj.cu
+++ b/lib/gpu/re_squared_lj.cu
@@ -32,7 +32,7 @@ __kernel void kernel_ellipsoid_sphere(__global numtyp4* x_,__global numtyp4 *q,
                    __global acctyp4 *ans, const int astride, 
                    __global acctyp *engv, __global int *err_flag, 
                    const int eflag, const int vflag, const int inum,
-                   const int nall, const int t_per_atom) {
+                   const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom;
@@ -353,8 +353,7 @@ __kernel void kernel_sphere_ellipsoid(__global numtyp4 *x_,__global numtyp4 *q,
                                const int stride, __global acctyp4 *ans,
                                __global acctyp *engv, __global int *err_flag,
                                const int eflag, const int vflag,const int start,
-                               const int inum, const int nall,
-                               const int t_per_atom) {
+                               const int inum, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom+start;
@@ -605,7 +604,7 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
                         __global acctyp4 *ans, __global acctyp *engv, 
                         __global int *err_flag, const int eflag, 
                         const int vflag, const int start, const int inum, 
-                        const int nall, const int t_per_atom) {
+                        const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom+start;
@@ -747,7 +746,7 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                              __global acctyp4 *ans, __global acctyp *engv,
                              __global int *err_flag, const int eflag,
                              const int vflag, const int start, const int inum,
-                             const int nall, const int t_per_atom) {
+                             const int t_per_atom) {
   int tid=THREAD_ID_X;
   int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
   ii+=tid/t_per_atom+start;