diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin
index e02d849ba4..2491493c45 100644
--- a/examples/amoeba/in.ubiquitin
+++ b/examples/amoeba/in.ubiquitin
@@ -4,7 +4,7 @@ units           real
 boundary        p p p
 
 atom_style amoeba
-
+#atom_modify sort 1000 7.0
 bond_style      class2
 angle_style     amoeba
 dihedral_style  none
diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index c7b4872db0..0d78a8618a 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -57,7 +57,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
                   const double polar_uscale) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
-                            cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar");
+                            cell_size,gpu_split,_screen,amoeba,
+                            "k_amoeba_polar", "k_amoeba_udirect2b");
   if (success!=0)
     return success;
 
@@ -164,15 +165,14 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
   int ainum=this->ans->inum();
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
-/*
-  this->k_polar.set_size(GX,BX);
-  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
-                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                    &this->ans->force, &this->ans->engv, &this->_tep,
-                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
-                    &this->_threads_per_atom,
-                    &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
-*/
+
+  this->k_udirect2b.set_size(GX,BX);
+  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
+                        &this->_threads_per_atom, &_aewald, &_off2,
+                        &_polar_dscale, &_polar_uscale);
+
   this->time_pair.stop();
   return GX;
 }
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 3d28939d42..adcff0e648 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -91,8 +91,8 @@ _texture( q_tex,int2);
     tep[i]=t;                                                               \
   }
 
-#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset,     \
-                          i, field, fieldp)                                 \
+#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i,  \
+                              fieldp)                                       \
   if (t_per_atom>1) {                                                       \
     red_acc[0][tid]=_fieldp[0];                                             \
     red_acc[1][tid]=_fieldp[1];                                             \
@@ -118,8 +118,8 @@ _texture( q_tex,int2);
     numtyp4 f, fp;                                                          \
     f.x  = _fieldp[0]; f.y  = _fieldp[0]; f.z  = _fieldp[2];                \
     fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5];                \
-    field[i] = f;                                                           \
-    fieldp[i] = fp;                                                         \
+    fieldp[ii] = f;                                                         \
+    fieldp[ii+inum] = fp;                                                   \
   }
 
 #else
@@ -152,8 +152,8 @@ _texture( q_tex,int2);
     tep[i]=t;                                                               \
   }
 
-#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset,     \
-                          i, field, fieldp)                                 \
+#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i,  \
+                             fieldp)                                        \
   if (t_per_atom>1) {                                                       \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom);                   \
@@ -168,8 +168,8 @@ _texture( q_tex,int2);
     numtyp4 f, fp;                                                          \
     f.x  = _fieldp[0]; f.y  = _fieldp[0]; f.z  = _fieldp[2];                \
     fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5];                \
-    field[i] = f;                                                           \
-    fieldp[i] = fp;                                                         \
+    fieldp[ii] = f;                                                         \
+    fieldp[ii+inum] = fp;                                                   \
   }
 
 #endif
@@ -177,6 +177,11 @@ _texture( q_tex,int2);
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MY_PIS (acctyp)1.77245385090551602729
 
+/* ----------------------------------------------------------------------
+   polar_real = real-space portion of induced dipole polarization
+   adapted from Tinker epreal1d() routine
+------------------------------------------------------------------------- */
+
 __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict extra,
                             const __global numtyp4 *restrict damping,
@@ -468,7 +473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
       term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0];
       term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr;
       numtyp tixx = ci*term3 + dix*term4 + dir*term5 +
-        (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6;
+        (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
       numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 +
         (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
 
@@ -684,19 +689,23 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      offset,eflag,vflag,ans,engv);
 }
 
+/* ----------------------------------------------------------------------
+  udirect2b = Ewald real direct field via list
+  udirect2b computes the real space contribution of the permanent
+   atomic multipole moments to the field via a neighbor list
+------------------------------------------------------------------------- */
+
 __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
-                            const __global numtyp *restrict extra,
-                            const __global numtyp4 *restrict damping,
-                            const __global numtyp4 *restrict sp_polar,
-                            const __global int *dev_nbor,
-                            const __global int *dev_packed,
-                            __global numtyp4 *restrict field,
-                            __global numtyp4 *restrict fieldp,
-                            const int eflag, const int vflag, const int inum,
-                            const int nall, const int nbor_pitch, const int t_per_atom,
-                            const numtyp aewald, const numtyp felec,
-                            const numtyp off2, const numtyp polar_dscale,
-                            const numtyp polar_uscale)
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 __global numtyp4 *restrict fieldp,
+                                 const int inum,  const int nall,
+                                 const int nbor_pitch, const int t_per_atom,
+                                 const numtyp aewald, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
 {
   int tid, ii, offset, i;
   atom_info(t_per_atom,ii,tid,offset);
@@ -771,7 +780,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
       numtyp r = ucl_sqrt(r2);
       numtyp rinv = ucl_recip(r);
       numtyp r2inv = rinv*rinv;
-      numtyp rr1 = felec * rinv;
+      numtyp rr1 = rinv;
       numtyp rr3 = rr1 * r2inv;
       numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
       numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
@@ -888,7 +897,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
 
   // accumulate field and fieldp
   
-  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,field,fieldp);
+  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 0c9a422cec..a1cf516777 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() {
   delete ans;
   delete nbor;
   k_polar.clear();
+  k_udirect2b.clear();
   k_special15.clear();
   if (pair_program) delete pair_program;
 }
@@ -53,7 +54,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                              const int maxspecial15,
                              const double cell_size, const double gpu_split,
                              FILE *_screen, const void *pair_program,
-                             const char *k_name) {
+                             const char *k_name_polar,
+                             const char *k_name_udirect2b) {
   screen=_screen;
 
   int gpu_nbor=0;
@@ -85,7 +87,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
 
   _block_size=device->pair_block_size();
   _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program,k_name);
+  compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b);
 
   if (_threads_per_atom>1 && gpu_nbor==0) {
     nbor->packing(true);
@@ -118,9 +120,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
   if (ef_nall==0)
     ef_nall=2000;
 
-  _max_alloc_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  _fieldp.alloc(_max_alloc_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-  _tep.alloc(_max_alloc_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
+  _max_fieldp_size = _max_tep_size;
+  _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
   dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
   dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
   dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
@@ -224,7 +227,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
+void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole,
                           double **host_uind, double **host_uinp,
@@ -252,9 +255,9 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
 
   // ------------------- Resize _tep array ------------------------
 
-  if (nall>_max_alloc_size) {
-    _max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    _tep.resize(_max_alloc_size*4);
+  if (nall>_max_tep_size) {
+    _max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _tep.resize(_max_tep_size*4);
 
     dev_nspecial15.clear();
     dev_special15.clear();
@@ -302,6 +305,10 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
   ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
   device->add_ans_object(ans);
   hd_balancer.stop_timer();
+
+  // copy tep from device to host
+
+  _tep.update_host(_max_tep_size*4,false);
 }
 
 // ---------------------------------------------------------------------------
@@ -338,9 +345,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const
 
   // ------------------- Resize _tep array ------------------------
 
-  if (nall>_max_alloc_size) {
-    _max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    _tep.resize(_max_alloc_size*4);
+  if (nall>_max_tep_size) {
+    _max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _tep.resize(_max_tep_size*4);
 
     dev_nspecial15.clear();
     dev_special15.clear();
@@ -397,9 +404,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const
 
   // copy tep from device to host
 
-  _tep.update_host(_max_alloc_size*4,false);
+  _tep.update_host(_max_tep_size*4,false);
 /*  
-  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_alloc_size);
+  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
   for (int i = 0; i < 10; i++) {
     numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
     printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
@@ -442,9 +449,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
 
   // ------------------- Resize _fieldp array ------------------------
 
-  if (nall>_max_alloc_size) {
-    _max_alloc_size=static_cast<int>(static_cast<double>(nall)*1.10);
-    _fieldp.resize(_max_alloc_size*8);
+  if (nall>_max_fieldp_size) {
+    _max_fieldp_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _fieldp.resize(_max_fieldp_size*8);
 
     dev_nspecial15.clear();
     dev_special15.clear();
@@ -492,13 +499,18 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
   *jnum=nbor->host_acc.begin();
 
   const int red_blocks=udirect2b(eflag,vflag);
-  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
-  //device->add_ans_object(ans);
   hd_balancer.stop_timer();
 
-  // copy field and fieldp from device to host
+  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
 
-  //_fieldp.update_host(_max_field_size*8,false);
+  _fieldp.update_host(_max_fieldp_size*8,false);
+/*  
+  printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", this->_field.cols(), _max_fieldp_size);
+  for (int i = 0; i < 10; i++) {
+    numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]);
+    printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z);
+  }
+*/  
 
   return nbor->host_jlist.begin()-host_start;
 }
@@ -566,7 +578,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
 
 template <class numtyp, class acctyp>
 void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
-                                  const char *kname) {
+                                  const char *kname_polar,
+                                  const char *kname_udirect2b) {
   if (_compiled)
     return;
 
@@ -575,7 +588,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
   std::string oclstring = device->compile_string()+" -DEVFLAG=1";
   pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
   
-  k_polar.set_function(*pair_program,kname);
+  k_polar.set_function(*pair_program,kname_polar);
+  k_udirect2b.set_function(*pair_program,kname_udirect2b);
   k_special15.set_function(*pair_program,"k_special15");
   pos_tex.get_texture(*pair_program,"pos_tex");
   q_tex.get_texture(*pair_program,"q_tex");
@@ -593,6 +607,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
 
 }
 
+// ---------------------------------------------------------------------------
+//  Specify 1-5 neighbors from the current neighbor list
+// ---------------------------------------------------------------------------
+
 template <class numtyp, class acctyp>
 int BaseAmoebaT::add_onefive_neighbors() {
   // Compute the block size and grid size to keep all cores busy
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index 7ef94c776e..ae0f33ef29 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -53,8 +53,8 @@ class BaseAmoeba {
     * - -5 Double precision is not supported on card **/
   int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15, const double cell_size,
-                  const double gpu_split, FILE *screen,
-                  const void *pair_program, const char *k_name);
+                  const double gpu_split, FILE *screen, const void *pair_program,
+                  const char *kname_polar, const char *kname_udirect2b);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead(const int add_kernels=0);
@@ -129,7 +129,7 @@ class BaseAmoeba {
                        bool &success);
 
   /// Compute polar real-space with host neighboring (not active for now)
-  void compute(const int f_ago, const int inum_full, const int nall,
+  void compute_polar_real(const int f_ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole, double **host_uind,
                double **host_uinp, int *ilist, int *numj,
@@ -190,8 +190,8 @@ class BaseAmoeba {
     double** uind, double** uinp);
 
   /// Per-atom arrays
-  UCL_Vector<numtyp,numtyp> _tep,_fieldp;
-  int _max_alloc_size;
+  UCL_Vector<numtyp,numtyp> _tep, _fieldp;
+  int _max_tep_size, _max_fieldp_size;
 
   // ------------------------ FORCE/ENERGY DATA -----------------------
 
@@ -210,7 +210,7 @@ class BaseAmoeba {
 
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pair_program;
-  UCL_Kernel k_polar,k_special15;
+  UCL_Kernel k_polar, k_udirect2b, k_special15;
   inline int block_size() { return _block_size; }
   inline void set_kernel(const int eflag, const int vflag) {}
 
@@ -226,7 +226,8 @@ class BaseAmoeba {
   double _gpu_overhead, _driver_overhead;
   UCL_D_Vec<int> *_nbor_data;
 
-  void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
+  void compile_kernels(UCL_Device &dev, const void *pair_string,
+     const char *kname_polar, const char *kname_udirect2b);
 
   virtual int polar_real(const int eflag, const int vflag) = 0;
   virtual int udirect2b(const int eflag, const int vflag) = 0;
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index 3cdaa25633..a5cc86e39d 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -298,7 +298,7 @@ void PairAmoebaGPU::init_style()
 
 void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
 {
-  bool gpu_udirect2b_ready = false;
+  bool gpu_udirect2b_ready = true;
   if (!gpu_udirect2b_ready) {
     PairAmoeba::udirect2b(field, fieldp);
     return;
@@ -334,7 +334,28 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
                                         domain->prd, &fieldp_pinned);
   if (!success)
     error->one(FLERR,"Insufficient memory on accelerator");
-  
+
+  // get field and fieldp values from the GPU lib
+
+  int nlocal = atom->nlocal;
+  double *field_ptr = (double *)fieldp_pinned;
+
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 4*i;
+    field[i][0] = field_ptr[idx];
+    field[i][1] = field_ptr[idx+1];
+    field[i][2] = field_ptr[idx+2]; 
+  }
+
+  double* fieldp_ptr = (double *)fieldp_pinned;
+  fieldp_ptr += 4*inum;
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 4*i;
+    fieldp[i][0] = fieldp_ptr[idx];
+    fieldp[i][1] = fieldp_ptr[idx+1];
+    fieldp[i][2] = fieldp_ptr[idx+2];
+  }
+
   // rebuild dipole-dipole pair list and store pairwise dipole matrices
   // done one atom at a time in real-space double loop over atoms & neighs
 
diff --git a/src/atom.cpp b/src/atom.cpp
index 4ad5110ec9..86e2b1151b 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -2274,7 +2274,6 @@ void Atom::setup_sort_bins()
 #ifdef LMP_GPU
   if (userbinsize == 0.0) {
     int ifix = modify->find_fix("package_gpu");
-/*
     if (ifix >= 0) {
       const double subx = domain->subhi[0] - domain->sublo[0];
       const double suby = domain->subhi[1] - domain->sublo[1];
@@ -2298,7 +2297,6 @@ void Atom::setup_sort_bins()
       bininvy = bininv;
       bininvz = bininv;
     }
-*/    
   }
 #endif