From c765861851c3464c9d1c93e90bba8c4e75b28aa0 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Sat, 11 Sep 2021 01:00:58 -0500
Subject: [PATCH] Cleaned up and re-arranged the functions to reflect the order
 of calling in a time step

---
 lib/gpu/lal_amoeba.cpp      | 105 +++--
 lib/gpu/lal_amoeba.cu       | 890 +++++++++++++++++-------------------
 lib/gpu/lal_base_amoeba.cpp |  28 +-
 3 files changed, 493 insertions(+), 530 deletions(-)
diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 3a83f57594..6f1e0cfaa9 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -126,49 +126,6 @@ double AmoebaT::host_memory_usage() const {
   return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
 }
 
-// ---------------------------------------------------------------------------
-// Calculate the polar real-space term, returning tep
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int AmoebaT::polar_real(const int eflag, const int vflag) {
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
-
-  int _nall=this->atom->nall();
-  int ainum=this->ans->inum();
-  int nbor_pitch=this->nbor->nbor_pitch();
-  this->time_pair.start();
-
-  // Build the short neighbor list if needed
-  if (!this->short_nbor_avail) {
-    this->k_short_nbor.set_size(GX,BX);
-    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
-                          &this->_nbor_data->begin(),
-                          &this->dev_short_nbor, &_off2, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
-    this->short_nbor_avail = true;
-  }
-
-  this->k_polar.set_size(GX,BX);
-  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
-                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                    &this->dev_short_nbor,
-                    &this->ans->force, &this->ans->engv, &this->_tep,
-                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
-                    &this->_threads_per_atom,
-                    &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
-  this->time_pair.stop();
-
-  // Signal that short nbor list is not avail for the next time step
-  //   do it here because polar_real() is the last kernel in a time step at this point
-
-  this->short_nbor_avail = false;
-
-  return GX;
-}
-
 // ---------------------------------------------------------------------------
 // Calculate the real-space permanent field, returning field and fieldp
 // ---------------------------------------------------------------------------
@@ -182,13 +139,13 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
   const int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
 
-  // Build the short neighbor list if needed
+  // Build the short neighbor list if not done yet
   if (!this->short_nbor_avail) {
     this->k_short_nbor.set_size(GX,BX);
     this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
-                          &this->_nbor_data->begin(),
-                          &this->dev_short_nbor, &_off2, &ainum,
-                          &nbor_pitch, &this->_threads_per_atom);
+                           &this->_nbor_data->begin(),
+                           &this->dev_short_nbor, &_off2, &ainum,
+                           &nbor_pitch, &this->_threads_per_atom);
     this->short_nbor_avail = true;
   }
   
@@ -219,9 +176,20 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
 
+  // Build the short neighbor list if not done yet
+  if (!this->short_nbor_avail) {
+    this->k_short_nbor.set_size(GX,BX);
+    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->dev_short_nbor, &_off2, &ainum,
+                          &nbor_pitch, &this->_threads_per_atom);
+    this->short_nbor_avail = true;
+  }
+
   this->k_umutual2b.set_size(GX,BX);
   this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
                         &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                        &this->dev_short_nbor,
                         &this->_fieldp, &ainum, &_nall, &nbor_pitch,
                         &this->_threads_per_atom, &_aewald, &_off2,
                         &_polar_dscale, &_polar_uscale);
@@ -230,5 +198,48 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) {
   return GX;
 }
 
+// ---------------------------------------------------------------------------
+// Calculate the polar real-space term, returning tep
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int AmoebaT::polar_real(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int _nall=this->atom->nall();
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+
+  // Build the short neighbor list if not done yet
+  if (!this->short_nbor_avail) {
+    this->k_short_nbor.set_size(GX,BX);
+    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->dev_short_nbor, &_off2, &ainum,
+                          &nbor_pitch, &this->_threads_per_atom);
+    this->short_nbor_avail = true;
+  }
+
+  this->k_polar.set_size(GX,BX);
+  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                    &this->dev_short_nbor,
+                    &this->ans->force, &this->ans->engv, &this->_tep,
+                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                    &this->_threads_per_atom,
+                    &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
+  this->time_pair.stop();
+
+  // Signal that short nbor list is not avail for the next time step
+  //   do it here because polar_real() is the last kernel in a time step at this point
+
+  this->short_nbor_avail = false;
+
+  return GX;
+}
+
 template class Amoeba<PRECISION,ACC_PRECISION>;
 }
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index bcb3aef309..fb515c69f7 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -185,6 +185,421 @@ _texture( q_tex,int2);
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MY_PIS (acctyp)1.77245385090551602729
 
+/* ----------------------------------------------------------------------
+  udirect2b = Ewald real direct field via list
+  udirect2b computes the real space contribution of the permanent
+   atomic multipole moments to the field via a neighbor list
+------------------------------------------------------------------------- */
+
+__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global numtyp4 *restrict fieldp,
+                                 const int inum,  const int nall,
+                                 const int nbor_pitch, const int t_per_atom,
+                                 const numtyp aewald, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  acctyp _fieldp[6];
+  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
+
+  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
+  numtyp4* polar1 = (numtyp4*)(&extra[0]);
+  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+
+  //numtyp4 xi__;
+
+  if (ii<inum) {
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    //numtyp qtmp; fetch(qtmp,i,q_tex);
+    //int itype=ix.w;
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    int itype,igroup;
+    numtyp bn[4],bcn[3];
+    numtyp fid[3],fip[3];
+    
+    dix = polar1[i].y;    // rpole[i][1];
+    diy = polar1[i].z;    // rpole[i][2];
+    diz = polar1[i].w;    // rpole[i][3];
+    qixx = polar2[i].x;   // rpole[i][4];
+    qixy = polar2[i].y;   // rpole[i][5];
+    qixz = polar2[i].z;   // rpole[i][6];
+    qiyy = polar2[i].w;   // rpole[i][8];
+    qiyz   = polar3[i].x; // rpole[i][9];
+    qizz   = polar3[i].y; // rpole[i][12];
+    itype  = polar3[i].z; // amtype[i];
+    igroup = polar3[i].w; // amgroup[i];
+    
+    // debug:
+    // xi__ = ix; xi__.w = itype;
+
+    numtyp pdi = damping[itype].x;
+    numtyp pti = damping[itype].y;
+    numtyp ddi = damping[itype].z;
+
+    numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
+    numtyp aesq2n = (numtyp)0.0;
+    if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald);
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      //int jtype=jx.w;
+ 
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      //if (r2>off2) continue;
+      
+      numtyp r = ucl_sqrt(r2);
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr1 = rinv;
+      numtyp rr3 = rr1 * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
+
+      numtyp ck = polar1[j].x;   // rpole[j][0];
+      numtyp dkx = polar1[j].y;  // rpole[j][1];
+      numtyp dky = polar1[j].z;  // rpole[j][2];
+      numtyp dkz = polar1[j].w;  // rpole[j][3];
+      numtyp qkxx = polar2[j].x; // rpole[j][4];
+      numtyp qkxy = polar2[j].y; // rpole[j][5];
+      numtyp qkxz = polar2[j].z; // rpole[j][6];
+      numtyp qkyy = polar2[j].w; // rpole[j][8];
+      numtyp qkyz = polar3[j].x; // rpole[j][9];
+      numtyp qkzz = polar3[j].y; // rpole[j][12];
+      int jtype =   polar3[j].z; // amtype[j];
+      int jgroup =  polar3[j].w; // amgroup[j];
+
+      numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
+      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
+      factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
+      if (igroup == jgroup) {
+        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
+        factor_dscale = polar_dscale;
+        factor_uscale = polar_uscale;
+      } else {
+        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
+        factor_dscale = factor_uscale = (numtyp)1.0;
+      }
+
+      // intermediates involving moments and separation distance
+
+      numtyp dir = dix*xr + diy*yr + diz*zr;
+      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
+      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
+      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
+      numtyp qir = qix*xr + qiy*yr + qiz*zr;
+      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
+      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
+      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
+      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
+      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
+
+      // calculate the real space Ewald error function terms
+
+      numtyp ralpha = aewald * r;
+      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
+      //bn[0] = erfc(ralpha) / r;
+      bn[0] = _erfc * rinv;
+
+      numtyp aefac = aesq2n;
+      for (int m = 1; m <= 3; m++) {
+        numtyp bfac = (numtyp) (m+m-1);
+        aefac = aesq2 * aefac;
+        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv;
+      }
+
+      // find the field components for Thole polarization damping
+
+      numtyp scale3 = (numtyp)1.0;
+      numtyp scale5 = (numtyp)1.0;
+      numtyp scale7 = (numtyp)1.0;
+      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
+      if (damp != (numtyp)0.0) {
+        numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
+        if (pgamma != (numtyp)0.0) {
+          damp = pgamma * ucl_powr(r/damp,(numtyp)1.5);
+          if (damp < (numtyp)50.0) {
+            numtyp expdamp = ucl_exp(-damp) ;
+            scale3 = (numtyp)1.0 - expdamp ;
+            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp);
+            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp);
+          }
+        } else {
+          pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
+          damp = pgamma * ucl_powr(r/damp,3.0);
+          if (damp < (numtyp)50.0) {
+            numtyp expdamp = ucl_exp(-damp);
+            scale3 = (numtyp)1.0 - expdamp;
+            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
+            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp);
+          }
+        }
+      } else { // damp == 0: ???
+      }
+
+      numtyp scalek = factor_dscale;
+      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
+      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
+      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
+      fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
+      fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
+      fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
+        
+      scalek = factor_pscale;
+      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
+      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
+      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
+      fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
+      fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
+      fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
+
+      _fieldp[0] += fid[0];
+      _fieldp[1] += fid[1];
+      _fieldp[2] += fid[2];
+      _fieldp[3] += fip[0];
+      _fieldp[4] += fip[1];
+      _fieldp[5] += fip[2];
+    }  // nbor
+
+  } // ii<inum
+
+  // accumulate field and fieldp
+  
+  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
+}
+
+/* ----------------------------------------------------------------------
+  umutual2b = Ewald real mutual field via list
+   umutual2b computes the real space contribution of the induced
+   atomic dipole moments to the field via a neighbor list
+------------------------------------------------------------------------- */
+
+__kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global numtyp4 *restrict fieldp,
+                                 const int inum,  const int nall,
+                                 const int nbor_pitch, const int t_per_atom,
+                                 const numtyp aewald, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  acctyp _fieldp[6];
+  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
+
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+  numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
+  numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);
+
+  //numtyp4 xi__;
+
+  if (ii<inum) {
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    //numtyp qtmp; fetch(qtmp,i,q_tex);
+    //int itype=ix.w;
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    int itype,igroup;
+    numtyp bn[4],bcn[3];
+    numtyp fid[3],fip[3];
+    
+    itype  = polar3[i].z; // amtype[i];
+    igroup = polar3[i].w; // amgroup[i];
+    
+    numtyp pdi = damping[itype].x;
+    numtyp ddi = damping[itype].z;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      //int jtype=jx.w;
+ 
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      if (r2>off2) continue;
+  
+      numtyp r = ucl_sqrt(r2);
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr1 = rinv;
+      numtyp rr3 = rr1 * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+
+      int jtype =   polar3[j].z; // amtype[j];
+      int jgroup =  polar3[j].w; // amgroup[j];
+      numtyp ukx = polar4[j].x;  // uind[j][0];
+      numtyp uky = polar4[j].y;  // uind[j][1];
+      numtyp ukz = polar4[j].z;  // uind[j][2];
+      numtyp ukxp = polar5[j].x; // uinp[j][0];
+      numtyp ukyp = polar5[j].y; // uinp[j][1];
+      numtyp ukzp = polar5[j].z; // uinp[j][2];
+
+      numtyp factor_uscale;
+
+      // find terms needed later to compute mutual polarization
+      // if (poltyp != DIRECT) 
+      numtyp scale3 = (numtyp)1.0;
+      numtyp scale5 = (numtyp)1.0;
+      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
+      if (damp != (numtyp)0.0) {
+        numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
+        if (pgamma != (numtyp)0.0) {
+          damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
+          if (damp < (numtyp)50.0) {
+            numtyp expdamp = ucl_exp(-damp);
+            scale3 = (numtyp)1.0 - expdamp;
+            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
+          }
+        }
+      } else { // damp == 0: ???
+      }
+
+      numtyp scalek = factor_uscale;
+      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
+      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
+      numtyp tdipdip[6]; // the following tdipdip is incorrect!! needs work to store tdipdip
+      tdipdip[0] = -bcn[0] + bcn[1]*xr*xr;
+      tdipdip[1] = bcn[1]*xr*yr;
+      tdipdip[2] = bcn[1]*xr*zr;
+      tdipdip[3] = -bcn[0] + bcn[1]*yr*yr;
+      tdipdip[4] = bcn[1]*yr*zr;
+      tdipdip[5] = -bcn[0] + bcn[1]*zr*zr;
+
+      fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz;
+      fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz;
+      fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz;
+      
+      fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp;
+      fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp;
+      fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp;
+      
+      _fieldp[0] += fid[0];
+      _fieldp[1] += fid[1];
+      _fieldp[2] += fid[2];
+      _fieldp[3] += fip[0];
+      _fieldp[4] += fip[1];
+      _fieldp[5] += fip[2];
+    }  // nbor
+
+  } // ii<inum
+
+  // accumulate field and fieldp
+  
+  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
+}
+
+/* ----------------------------------------------------------------------
+   scan standard neighbor list and make it compatible with 1-5 neighbors
+   if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
+   else scan special15 to see if a 1-5 neighbor and adjust offset to SBBITS15
+   else do nothing to IJ entry
+------------------------------------------------------------------------- */
+
+__kernel void k_special15(__global int * dev_nbor,
+                          const __global int * dev_packed,
+                          const __global tagint *restrict tag,
+                          const __global int *restrict nspecial15,
+                          const __global tagint *restrict special15,
+                          const int inum, const int nall, const int nbor_pitch,
+                          const int t_per_atom) {
+  int tid, ii, offset, n_stride, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+  
+    int numj, nbor, nbor_end;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    int n15 = nspecial15[ii];
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int sj=dev_packed[nbor];
+      int which = sj >> SBBITS & 3;
+      int j = sj & NEIGHMASK;
+      tagint jtag = tag[j];
+
+      if (!which) {
+        int offset=ii;
+        for (int k=0; k<n15; k++) {
+          if (special15[offset] == jtag) {
+            which = 4;
+            break;
+          }
+          offset += nall;
+        }
+      }
+
+      if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);
+    } // for nbor
+
+  } // if ii
+}
+
 /* ----------------------------------------------------------------------
    polar_real = real-space portion of induced dipole polarization
    adapted from Tinker epreal1d() routine
@@ -311,7 +726,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
       numtyp zr = jx.z - ix.z;
       numtyp r2 = xr*xr + yr*yr + zr*zr;
 
-      if (r2>off2) continue;
+      //if (r2>off2) continue;
   
       numtyp r = ucl_sqrt(r2);
       
@@ -707,474 +1122,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      offset,eflag,vflag,ans,engv);
 }
 
-/* ----------------------------------------------------------------------
-  udirect2b = Ewald real direct field via list
-  udirect2b computes the real space contribution of the permanent
-   atomic multipole moments to the field via a neighbor list
-------------------------------------------------------------------------- */
-
-__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
-                                 const __global numtyp *restrict extra,
-                                 const __global numtyp4 *restrict damping,
-                                 const __global numtyp4 *restrict sp_polar,
-                                 const __global int *dev_nbor,
-                                 const __global int *dev_packed,
-                                 const __global int *dev_short_nbor,
-                                 __global numtyp4 *restrict fieldp,
-                                 const int inum,  const int nall,
-                                 const int nbor_pitch, const int t_per_atom,
-                                 const numtyp aewald, const numtyp off2,
-                                 const numtyp polar_dscale, const numtyp polar_uscale)
-{
-  int tid, ii, offset, i;
-  atom_info(t_per_atom,ii,tid,offset);
-
-  int n_stride;
-  local_allocate_store_charge();
-
-  acctyp _fieldp[6];
-  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
-
-  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
-  numtyp4* polar1 = (numtyp4*)(&extra[0]);
-  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
-  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
-
-  //numtyp4 xi__;
-
-  if (ii<inum) {
-    int numj, nbor, nbor_end;
-    const __global int* nbor_mem=dev_packed;
-    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,nbor_end,nbor);
-
-    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //numtyp qtmp; fetch(qtmp,i,q_tex);
-    //int itype=ix.w;
-
-    // recalculate numj and nbor_end for use of the short nbor list
-    if (dev_packed==dev_nbor) {
-      numj = dev_short_nbor[nbor];
-      nbor += n_stride;
-      nbor_end = nbor+fast_mul(numj,n_stride);
-      nbor_mem = dev_short_nbor;
-    }
-
-    int itype,igroup;
-    numtyp bn[4],bcn[3];
-    numtyp fid[3],fip[3];
-    
-    dix = polar1[i].y;    // rpole[i][1];
-    diy = polar1[i].z;    // rpole[i][2];
-    diz = polar1[i].w;    // rpole[i][3];
-    qixx = polar2[i].x;   // rpole[i][4];
-    qixy = polar2[i].y;   // rpole[i][5];
-    qixz = polar2[i].z;   // rpole[i][6];
-    qiyy = polar2[i].w;   // rpole[i][8];
-    qiyz   = polar3[i].x; // rpole[i][9];
-    qizz   = polar3[i].y; // rpole[i][12];
-    itype  = polar3[i].z; // amtype[i];
-    igroup = polar3[i].w; // amgroup[i];
-    
-    // debug:
-    // xi__ = ix; xi__.w = itype;
-
-    numtyp pdi = damping[itype].x;
-    numtyp pti = damping[itype].y;
-    numtyp ddi = damping[itype].z;
-
-    numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
-    numtyp aesq2n = (numtyp)0.0;
-    if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald);
-
-    for ( ; nbor<nbor_end; nbor+=n_stride) {
-
-      int jextra=nbor_mem[nbor];
-      int j = jextra & NEIGHMASK15;
-
-      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int jtype=jx.w;
- 
-      // Compute r12
-      numtyp xr = jx.x - ix.x;
-      numtyp yr = jx.y - ix.y;
-      numtyp zr = jx.z - ix.z;
-      numtyp r2 = xr*xr + yr*yr + zr*zr;
-
-      if (r2>off2) {
-        if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj);
-        continue;
-      }
-      numtyp r = ucl_sqrt(r2);
-      numtyp rinv = ucl_recip(r);
-      numtyp r2inv = rinv*rinv;
-      numtyp rr1 = rinv;
-      numtyp rr3 = rr1 * r2inv;
-      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
-      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
-
-      numtyp ck = polar1[j].x;   // rpole[j][0];
-      numtyp dkx = polar1[j].y;  // rpole[j][1];
-      numtyp dky = polar1[j].z;  // rpole[j][2];
-      numtyp dkz = polar1[j].w;  // rpole[j][3];
-      numtyp qkxx = polar2[j].x; // rpole[j][4];
-      numtyp qkxy = polar2[j].y; // rpole[j][5];
-      numtyp qkxz = polar2[j].z; // rpole[j][6];
-      numtyp qkyy = polar2[j].w; // rpole[j][8];
-      numtyp qkyz = polar3[j].x; // rpole[j][9];
-      numtyp qkzz = polar3[j].y; // rpole[j][12];
-      int jtype =   polar3[j].z; // amtype[j];
-      int jgroup =  polar3[j].w; // amgroup[j];
-
-      numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
-      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
-      factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
-      if (igroup == jgroup) {
-        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
-        factor_dscale = polar_dscale;
-        factor_uscale = polar_uscale;
-      } else {
-        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
-        factor_dscale = factor_uscale = (numtyp)1.0;
-      }
-
-      // intermediates involving moments and separation distance
-
-      numtyp dir = dix*xr + diy*yr + diz*zr;
-      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
-      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
-      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
-      numtyp qir = qix*xr + qiy*yr + qiz*zr;
-      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
-      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
-      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
-      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
-      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
-
-      // calculate the real space Ewald error function terms
-
-      numtyp ralpha = aewald * r;
-      numtyp exp2a = ucl_exp(-ralpha*ralpha);
-      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
-      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
-      bn[0] = _erfc * rinv;
-
-      numtyp aefac = aesq2n;
-      for (int m = 1; m <= 3; m++) {
-        numtyp bfac = (numtyp) (m+m-1);
-        aefac = aesq2 * aefac;
-        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv;
-      }
-
-      // find the field components for Thole polarization damping
-
-      numtyp scale3 = (numtyp)1.0;
-      numtyp scale5 = (numtyp)1.0;
-      numtyp scale7 = (numtyp)1.0;
-      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
-      if (damp != (numtyp)0.0) {
-        numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
-        if (pgamma != (numtyp)0.0) {
-          damp = pgamma * ucl_powr(r/damp,(numtyp)1.5);
-          if (damp < (numtyp)50.0) {
-            numtyp expdamp = ucl_exp(-damp) ;
-            scale3 = (numtyp)1.0 - expdamp ;
-            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp);
-            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp);
-          }
-        } else {
-          pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
-          damp = pgamma * ucl_powr(r/damp,3.0);
-          if (damp < (numtyp)50.0) {
-            numtyp expdamp = ucl_exp(-damp);
-            scale3 = (numtyp)1.0 - expdamp;
-            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
-            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp);
-          }
-        }
-      } else { // damp == 0: ???
-      }
-
-      numtyp scalek = factor_dscale;
-      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
-      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
-      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
-      fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
-      fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
-      fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
-        
-      scalek = factor_pscale;
-      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
-      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
-      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
-      fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
-      fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
-      fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
-
-      _fieldp[0] += fid[0];
-      _fieldp[1] += fid[1];
-      _fieldp[2] += fid[2];
-      _fieldp[3] += fip[0];
-      _fieldp[4] += fip[1];
-      _fieldp[5] += fip[2];
-    }  // nbor
-
-  } // ii<inum
-
-  // accumulate field and fieldp
-  
-  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
-}
-
-/* ----------------------------------------------------------------------
-  umutual2b = Ewald real mutual field via list
-   umutual2b computes the real space contribution of the induced
-   atomic dipole moments to the field via a neighbor list
-------------------------------------------------------------------------- */
-
-__kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
-                                 const __global numtyp *restrict extra,
-                                 const __global numtyp4 *restrict damping,
-                                 const __global numtyp4 *restrict sp_polar,
-                                 const __global int *dev_nbor,
-                                 const __global int *dev_packed,
-                                 __global numtyp4 *restrict fieldp,
-                                 const int inum,  const int nall,
-                                 const int nbor_pitch, const int t_per_atom,
-                                 const numtyp aewald, const numtyp off2,
-                                 const numtyp polar_dscale, const numtyp polar_uscale)
-{
-  int tid, ii, offset, i;
-  atom_info(t_per_atom,ii,tid,offset);
-
-  int n_stride;
-  local_allocate_store_charge();
-
-  acctyp _fieldp[6];
-  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
-
-  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
-  numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
-  numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);
-
-  //numtyp4 xi__;
-
-  if (ii<inum) {
-    int numj, nbor, nbor_end;
-    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,nbor_end,nbor);
-
-    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    //numtyp qtmp; fetch(qtmp,i,q_tex);
-    //int itype=ix.w;
-
-    int itype,igroup;
-    numtyp bn[4],bcn[3];
-    numtyp fid[3],fip[3];
-    
-    itype  = polar3[i].z; // amtype[i];
-    igroup = polar3[i].w; // amgroup[i];
-    
-    numtyp pdi = damping[itype].x;
-    numtyp ddi = damping[itype].z;
-
-    for ( ; nbor<nbor_end; nbor+=n_stride) {
-
-      int jextra=dev_packed[nbor];
-      int j = jextra & NEIGHMASK15;
-
-      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      //int jtype=jx.w;
- 
-      // Compute r12
-      numtyp xr = jx.x - ix.x;
-      numtyp yr = jx.y - ix.y;
-      numtyp zr = jx.z - ix.z;
-      numtyp r2 = xr*xr + yr*yr + zr*zr;
-
-      if (r2>off2) continue;
-  
-      numtyp r = ucl_sqrt(r2);
-      numtyp rinv = ucl_recip(r);
-      numtyp r2inv = rinv*rinv;
-      numtyp rr1 = rinv;
-      numtyp rr3 = rr1 * r2inv;
-      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
-
-      int jtype =   polar3[j].z; // amtype[j];
-      int jgroup =  polar3[j].w; // amgroup[j];
-      numtyp ukx = polar4[j].x;  // uind[j][0];
-      numtyp uky = polar4[j].y;  // uind[j][1];
-      numtyp ukz = polar4[j].z;  // uind[j][2];
-      numtyp ukxp = polar5[j].x; // uinp[j][0];
-      numtyp ukyp = polar5[j].y; // uinp[j][1];
-      numtyp ukzp = polar5[j].z; // uinp[j][2];
-
-      numtyp factor_uscale;
-
-      // find terms needed later to compute mutual polarization
-      // if (poltyp != DIRECT) 
-      numtyp scale3 = (numtyp)1.0;
-      numtyp scale5 = (numtyp)1.0;
-      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
-      if (damp != (numtyp)0.0) {
-        numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
-        if (pgamma != (numtyp)0.0) {
-          damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
-          if (damp < (numtyp)50.0) {
-            numtyp expdamp = ucl_exp(-damp);
-            scale3 = (numtyp)1.0 - expdamp;
-            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
-          }
-        }
-      } else { // damp == 0: ???
-      }
-
-      numtyp scalek = factor_uscale;
-      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
-      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
-      numtyp tdipdip[6]; // the following tdipdip is incorrect!! needs work to store tdipdip
-      tdipdip[0] = -bcn[0] + bcn[1]*xr*xr;
-      tdipdip[1] = bcn[1]*xr*yr;
-      tdipdip[2] = bcn[1]*xr*zr;
-      tdipdip[3] = -bcn[0] + bcn[1]*yr*yr;
-      tdipdip[4] = bcn[1]*yr*zr;
-      tdipdip[5] = -bcn[0] + bcn[1]*zr*zr;
-
-      fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz;
-      fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz;
-      fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz;
-      
-      fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp;
-      fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp;
-      fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp;
-      
-      _fieldp[0] += fid[0];
-      _fieldp[1] += fid[1];
-      _fieldp[2] += fid[2];
-      _fieldp[3] += fip[0];
-      _fieldp[4] += fip[1];
-      _fieldp[5] += fip[2];
-    }  // nbor
-
-  } // ii<inum
-
-  // accumulate field and fieldp
-  
-  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
-}
-
-/* ----------------------------------------------------------------------
-   scan standard neighbor list and make it compatible with 1-5 neighbors
-   if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
-   else scan special15 to see if a 1-5 neighbor and adjust offset to SBBITS15
-   else do nothing to IJ entry
-------------------------------------------------------------------------- */
-
-__kernel void k_special15(__global int * dev_nbor,
-                          const __global int * dev_packed,
-                          const __global tagint *restrict tag,
-                          const __global int *restrict nspecial15,
-                          const __global tagint *restrict special15,
-                          const int inum, const int nall, const int nbor_pitch,
-                          const int t_per_atom) {
-  int tid, ii, offset, n_stride, i;
-  atom_info(t_per_atom,ii,tid,offset);
-
-  if (ii<inum) {
-  
-    int numj, nbor, nbor_end;
-    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,nbor_end,nbor);
-
-    int n15 = nspecial15[ii];
-
-    for ( ; nbor<nbor_end; nbor+=n_stride) {
-
-      int sj=dev_packed[nbor];
-      int which = sj >> SBBITS & 3;
-      int j = sj & NEIGHMASK;
-      tagint jtag = tag[j];
-
-      if (!which) {
-        int offset=ii;
-        for (int k=0; k<n15; k++) {
-          if (special15[offset] == jtag) {
-            which = 4;
-            break;
-          }
-          offset += nall;
-        }
-      }
-
-      if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);
-    } // for nbor
-
-  } // if ii
-}
-/*
 __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
-                                   const numtyp off2, __global int * dev_nbor,
-                                   const __global int * dev_packed,
-                                   const int inum, const int nbor_pitch,
-                                   const int t_per_atom) {
-  int tid, ii, offset, n_stride, i;
-  atom_info(t_per_atom,ii,tid,offset);
-
-  int new_numj=0;
-
-  if (ii<inum) {
-    int numj, nbor, nbor_end;
-    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
-              n_stride,nbor_end,nbor);
-
-    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    if (i == 0) printf("i = 0: numj before = %d\n", numj);
-    __global int *out_list=dev_nbor+nbor;
-    const int out_stride=n_stride;
-    
-    for ( ; nbor<nbor_end; nbor+=n_stride) {
-    
-      int sj=dev_packed[nbor];
-      int j = sj & NEIGHMASK15;
-      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-
-      // Compute r12
-      numtyp delx = ix.x-jx.x;
-      numtyp dely = ix.y-jx.y;
-      numtyp delz = ix.z-jx.z;
-      numtyp rsq = delx*delx+dely*dely+delz*delz;
-
-      if (rsq<=off2) {
-        *out_list=sj;
-        out_list+=out_stride;
-
-        new_numj++;
-        if (i == 0 && offset == 0) printf("neighbor of i = 0 within  off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
-      } else {
-        if (i == 0 && offset == 0) printf("neighbor of i = 0 outside off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
-      }
-    } // for nbor
-  } // if ii
-
-  if (t_per_atom>1) {
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1)
-      new_numj += shfl_down(new_numj, s, t_per_atom);
-  }
-  if (offset==0 && ii<inum) {
-    dev_nbor[ii+nbor_pitch]=new_numj;
-    if (i == 0) printf("i = 0: numj after = %d\n", new_numj);
-  }
-}
-*/
-__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global int * dev_nbor,
-                                   const __global int * dev_packed,
-                                   __global int * dev_short_nbor,
-                                   const numtyp off2,
-                                   const int inum, const int nbor_pitch,
-                                   const int t_per_atom) {
+                                  const __global int * dev_nbor,
+                                  const __global int * dev_packed,
+                                  __global int * dev_short_nbor,
+                                  const numtyp off2,
+                                  const int inum, const int nbor_pitch,
+                                  const int t_per_atom) {
   __local int n_stride;
   int tid, ii, offset;
   atom_info(t_per_atom,ii,tid,offset);
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 81b2527222..d06d7dfa57 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -101,9 +101,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
     _nbor_data=&(nbor->dev_nbor);
   }
 
-  bool allocate_packed = false;
+  bool alloc_packed=false;
   success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
-                              _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom);
+                              _gpu_host,max_nbors,cell_size,alloc_packed,_threads_per_atom);
   if (success!=0)
     return success;
                               
@@ -231,8 +231,6 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
     add_onefive_neighbors();
   }
 
-  //nbor->copy_unpacked(inum,mn);
-
   double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
@@ -336,17 +334,17 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f
 
 template <class numtyp, class acctyp>
 int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall,
-                           double **host_x, int *host_type, int *host_amtype,
-                           int *host_amgroup, double **host_rpole,
-                           double **host_uind, double **host_uinp,
-                           double *sublo, double *subhi, tagint *tag,
-                           int **nspecial, tagint **special,
-                           int *nspecial15, tagint **special15,
-                           const bool eflag_in, const bool vflag_in,
-                           const bool eatom, const bool vatom, int &host_start,
-                           int **&ilist, int **&jnum, const double cpu_time,
-                           bool &success, double *host_q, double *boxlo,
-                           double *prd) {
+                              double **host_x, int *host_type, int *host_amtype,
+                              int *host_amgroup, double **host_rpole,
+                              double **host_uind, double **host_uinp,
+                              double *sublo, double *subhi, tagint *tag,
+                              int **nspecial, tagint **special,
+                              int *nspecial15, tagint **special15,
+                              const bool eflag_in, const bool vflag_in,
+                              const bool eatom, const bool vatom, int &host_start,
+                              int **&ilist, int **&jnum, const double cpu_time,
+                              bool &success, double *host_q, double *boxlo,
+                              double *prd) {
   acc_timers();
   int eflag, vflag;
   if (eatom) eflag=2;