diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 8bcbd6c4cb..08b3f1c9a5 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -59,7 +59,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                             cell_size,gpu_split,_screen,amoeba,
                             "k_amoeba_polar", "k_amoeba_udirect2b",
-                            "k_amoeba_umutual2b");
+                            "k_amoeba_umutual2b", "k_amoeba_short_nbor");
   if (success!=0)
     return success;
 
@@ -157,16 +157,23 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int AmoebaT::udirect2b(const int eflag, const int vflag) {
+  int _nall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  int ainum=this->ans->inum(); 
+
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
-
-  int _nall=this->atom->nall();
-  int ainum=this->ans->inum();
-  int nbor_pitch=this->nbor->nbor_pitch();
-  this->time_pair.start();
-
+  int GX;
+  
+  GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  this->k_short_nbor.set_size(GX,BX);
+  // NOTE: this->nbor->dev_packed is not allocated!!
+/*  
+  this->k_short_nbor.run(&this->atom->x, &_off2,
+                         &this->nbor->dev_nbor,  &this->nbor->dev_packed,
+                         &ainum, &nbor_pitch, &this->_threads_per_atom);
+*/
+  GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
   this->k_udirect2b.set_size(GX,BX);
   this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
                         &this->nbor->dev_nbor, &this->_nbor_data->begin(),
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 30db5ba334..9df1dbe485 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -781,8 +781,10 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
       numtyp zr = jx.z - ix.z;
       numtyp r2 = xr*xr + yr*yr + zr*zr;
 
-      if (r2>off2) continue;
-  
+      if (r2>off2) {
+        if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj);
+        continue;
+      }
       numtyp r = ucl_sqrt(r2);
       numtyp rinv = ucl_recip(r);
       numtyp r2inv = rinv*rinv;
@@ -1091,3 +1093,112 @@ __kernel void k_special15(__global int * dev_nbor,
 
   } // if ii
 }
+
+/*
+__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
+                                   const numtyp off2, __global int * dev_nbor,
+                                   const __global int * dev_packed,
+                                   const int inum, const int nbor_pitch,
+                                   const int t_per_atom) {
+  int tid, ii, offset, n_stride, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int new_numj=0;
+
+  if (ii<inum) {
+    int numj, nbor, nbor_end;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    if (i == 0) printf("i = 0: numj before = %d\n", numj);
+    __global int *out_list=dev_nbor+nbor;
+    const int out_stride=n_stride;
+    
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+    
+      int sj=dev_packed[nbor];
+      int j = sj & NEIGHMASK15;
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<=off2) {
+        *out_list=sj;
+        out_list+=out_stride;
+
+        new_numj++;
+        if (i == 0 && offset == 0) printf("neighbor of i = 0 within  off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
+      } else {
+        if (i == 0 && offset == 0) printf("neighbor of i = 0 outside off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
+      }
+    } // for nbor
+  } // if ii
+
+  if (t_per_atom>1) {
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1)
+      new_numj += shfl_down(new_numj, s, t_per_atom);
+  }
+  if (offset==0 && ii<inum) {
+    dev_nbor[ii+nbor_pitch]=new_numj;
+    if (i == 0) printf("i = 0: numj after = %d\n", new_numj);
+  }
+}
+*/
+#ifdef LAL_SIMD_IP_SYNC
+#define t_per_atom t_per_atom_in
+#else
+#define t_per_atom 1
+#endif
+
+__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
+                                   const numtyp off2,
+                                   __global int * dev_nbor,
+                                   const __global int * dev_packed,
+                                   const int inum, const int nbor_pitch,
+                                   const int t_per_atom_in) {
+  const int ii=GLOBAL_ID_X;
+
+  if (ii<inum) {
+/*    
+    const int i=dev_packed[ii];
+    
+    int nbor=ii+nbor_pitch;
+    const int numj=dev_packed[nbor];
+    nbor+=nbor_pitch;
+    const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int newj=0;
+
+    __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
+    const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
+
+    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
+      int sj=dev_packed[nbor];
+      int j = sj & NEIGHMASK15;
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<off2) {
+        //*out_list=sj;
+        out_list++;
+        newj++;
+        if ((newj & (t_per_atom-1))==0)
+          out_list+=out_stride;
+      }
+
+    } // for nbor
+    //dev_nbor[ii+nbor_pitch]=newj;
+*/
+  } // if ii
+}
\ No newline at end of file
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 6bcd6c50c7..1419e16de1 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -40,6 +40,7 @@ BaseAmoebaT::~BaseAmoeba() {
   k_udirect2b.clear();
   k_umutual2b.clear();
   k_special15.clear();
+  k_short_nbor.clear();
   if (pair_program) delete pair_program;
 }
 
@@ -57,7 +58,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                              FILE *_screen, const void *pair_program,
                              const char *k_name_polar,
                              const char *k_name_udirect2b,
-                             const char *k_name_umutual2b) {
+                             const char *k_name_umutual2b,
+                             const char *k_name_short_nbor) {
   screen=_screen;
 
   int gpu_nbor=0;
@@ -89,19 +91,21 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
 
   _block_size=device->pair_block_size();
   _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b);
+  compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,
+                  k_name_umutual2b,k_name_short_nbor);
 
   if (_threads_per_atom>1 && gpu_nbor==0) {
     nbor->packing(true);
     _nbor_data=&(nbor->dev_packed);
-  } else
+  } else {
     _nbor_data=&(nbor->dev_nbor);
-
-  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
-                  max_nbors,cell_size,false,_threads_per_atom);
+  }
+    
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
+                              _gpu_host,max_nbors,cell_size,false,_threads_per_atom);
   if (success!=0)
     return success;
-
+                              
   // Initialize host-device load balancer
   hd_balancer.init(device,gpu_nbor,gpu_split);
 
@@ -223,6 +227,8 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
     add_onefive_neighbors();
   }
 
+  //nbor->copy_unpacked(inum,mn);
+
   double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
   if (bytes>_max_an_bytes)
     _max_an_bytes=bytes;
@@ -450,7 +456,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
                           eflag_in, vflag_in, eatom, vatom,
                           host_start, ilist, jnum, cpu_time,
                           success, host_q, boxlo, prd);
-
+                         
   // ------------------- Resize _fieldp array ------------------------
 
   if (nall>_max_fieldp_size) {
@@ -692,7 +698,8 @@ template <class numtyp, class acctyp>
 void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                   const char *kname_polar,
                                   const char *kname_udirect2b,
-                                  const char *kname_umutual2b) {
+                                  const char *kname_umutual2b,
+                                  const char *kname_short_nbor) {
   if (_compiled)
     return;
 
@@ -704,6 +711,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
   k_polar.set_function(*pair_program,kname_polar);
   k_udirect2b.set_function(*pair_program,kname_udirect2b);
   k_umutual2b.set_function(*pair_program,kname_umutual2b);
+  k_short_nbor.set_function(*pair_program,kname_short_nbor);
   k_special15.set_function(*pair_program,"k_special15");
   pos_tex.get_texture(*pair_program,"pos_tex");
   q_tex.get_texture(*pair_program,"q_tex");
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index 3fb752c97c..755f11610f 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -55,7 +55,7 @@ class BaseAmoeba {
                   const int maxspecial, const int maxspecial15, const double cell_size,
                   const double gpu_split, FILE *screen, const void *pair_program,
                   const char *kname_polar, const char *kname_udirect2b,
-                  const char *kname_umutual2b);
+                  const char *kname_umutual2b, const char *kname_short_nbor);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead(const int add_kernels=0);
@@ -239,6 +239,7 @@ class BaseAmoeba {
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pair_program;
   UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
+  UCL_Kernel k_short_nbor;
   inline int block_size() { return _block_size; }
   inline void set_kernel(const int eflag, const int vflag) {}
 
@@ -256,7 +257,7 @@ class BaseAmoeba {
 
   void compile_kernels(UCL_Device &dev, const void *pair_string,
      const char *kname_polar, const char *kname_udirect2b,
-     const char *kname_umutual2b);
+     const char *kname_umutual2b, const char *kname_short_nbor);
 
   virtual int udirect2b(const int eflag, const int vflag) = 0;
   virtual int umutual2b(const int eflag, const int vflag) = 0;
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index b9ee884fa0..c51f741c0a 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
 
   gpu_udirect2b_ready = true;
   gpu_umutual2b_ready = false;
-  gpu_polar_real_ready = true;
+  gpu_polar_real_ready = false;
 
   GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }
@@ -297,7 +297,7 @@ void PairAmoebaGPU::init_style()
   // set the energy unit conversion factor for polar real-space calculation
 
   double felec = 0.5 * electric / am_dielectric;
-  
+
   int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp,
                                 special_polar_wscale, special_polar_piscale,
                                 special_polar_pscale, atom->nlocal,