diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 0d78a8618a..8bcbd6c4cb 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -58,7 +58,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                             cell_size,gpu_split,_screen,amoeba,
-                            "k_amoeba_polar", "k_amoeba_udirect2b");
+                            "k_amoeba_polar", "k_amoeba_udirect2b",
+                            "k_amoeba_umutual2b");
   if (success!=0)
     return success;
 
@@ -152,7 +153,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
 }
 
 // ---------------------------------------------------------------------------
-// Calculate the polar real-space term, returning tep
+// Calculate the real-space permanent field, returning field and fieldp
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int AmoebaT::udirect2b(const int eflag, const int vflag) {
@@ -177,5 +178,31 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
   return GX;
 }
 
+// ---------------------------------------------------------------------------
+// Calculate the real-space induced field, returning field and fieldp
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int AmoebaT::umutual2b(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int _nall=this->atom->nall();
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+
+  this->k_umutual2b.set_size(GX,BX);
+  this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
+                        &this->_threads_per_atom, &_aewald, &_off2,
+                        &_polar_dscale, &_polar_uscale);
+
+  this->time_pair.stop();
+  return GX;
+}
+
 template class Amoeba<PRECISION,ACC_PRECISION>;
 }
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index c4f146a7c9..192f440112 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -907,6 +907,216 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
   store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
 }
 
+/* ----------------------------------------------------------------------
+  umutual2b = Ewald real mutual field via list
+   umutual2b computes the real space contribution of the induced
+   atomic dipole moments to the field via a neighbor list
+------------------------------------------------------------------------- */
+
+__kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 __global numtyp4 *restrict fieldp,
+                                 const int inum,  const int nall,
+                                 const int nbor_pitch, const int t_per_atom,
+                                 const numtyp aewald, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  acctyp _fieldp[6];
+  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
+
+  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
+  numtyp4* polar1 = (numtyp4*)(&extra[0]);
+  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+
+  //numtyp4 xi__;
+
+  if (ii<inum) {
+    int numj, nbor, nbor_end;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    //numtyp qtmp; fetch(qtmp,i,q_tex);
+    //int itype=ix.w;
+
+    int itype,igroup;
+    numtyp bn[4],bcn[3];
+    numtyp fid[3],fip[3];
+    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
+    
+    ci  = polar1[i].x;    // rpole[i][0];
+    dix = polar1[i].y;    // rpole[i][1];
+    diy = polar1[i].z;    // rpole[i][2];
+    diz = polar1[i].w;    // rpole[i][3];
+    qixx = polar2[i].x;   // rpole[i][4];
+    qixy = polar2[i].y;   // rpole[i][5];
+    qixz = polar2[i].z;   // rpole[i][6];
+    qiyy = polar2[i].w;   // rpole[i][8];
+    qiyz   = polar3[i].x; // rpole[i][9];
+    qizz   = polar3[i].y; // rpole[i][12];
+    itype  = polar3[i].z; // amtype[i];
+    igroup = polar3[i].w; // amgroup[i];
+    
+    // debug:
+    // xi__ = ix; xi__.w = itype;
+
+    numtyp pdi = damping[itype].x;
+    numtyp pti = damping[itype].y;
+    numtyp ddi = damping[itype].z;
+
+    numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
+    numtyp aesq2n = (numtyp)0.0;
+    if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald);
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int jextra=dev_packed[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      //int jtype=jx.w;
+ 
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      if (r2>off2) continue;
+  
+      numtyp r = ucl_sqrt(r2);
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr1 = rinv;
+      numtyp rr3 = rr1 * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
+
+      numtyp ck = polar1[j].x;   // rpole[j][0];
+      numtyp dkx = polar1[j].y;  // rpole[j][1];
+      numtyp dky = polar1[j].z;  // rpole[j][2];
+      numtyp dkz = polar1[j].w;  // rpole[j][3];
+      numtyp qkxx = polar2[j].x; // rpole[j][4];
+      numtyp qkxy = polar2[j].y; // rpole[j][5];
+      numtyp qkxz = polar2[j].z; // rpole[j][6];
+      numtyp qkyy = polar2[j].w; // rpole[j][8];
+      numtyp qkyz = polar3[j].x; // rpole[j][9];
+      numtyp qkzz = polar3[j].y; // rpole[j][12];
+      int jtype =   polar3[j].z; // amtype[j];
+      int jgroup =  polar3[j].w; // amgroup[j];
+
+      numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
+      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
+      factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
+      if (igroup == jgroup) {
+        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
+        factor_dscale = polar_dscale;
+        factor_uscale = polar_uscale;
+      } else {
+        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
+        factor_dscale = factor_uscale = (numtyp)1.0;
+      }
+
+      // intermediates involving moments and separation distance
+
+      numtyp dir = dix*xr + diy*yr + diz*zr;
+      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
+      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
+      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
+      numtyp qir = qix*xr + qiy*yr + qiz*zr;
+      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
+      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
+      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
+      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
+      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
+
+      // calculate the real space Ewald error function terms
+
+      numtyp ralpha = aewald * r;
+      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
+      //bn[0] = erfc(ralpha) / r;
+      bn[0] = _erfc * rinv;
+
+      numtyp aefac = aesq2n;
+      for (int m = 1; m <= 3; m++) {
+        numtyp bfac = (numtyp) (m+m-1);
+        aefac = aesq2 * aefac;
+        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv;
+      }
+
+      // find the field components for Thole polarization damping
+
+      numtyp scale3 = (numtyp)1.0;
+      numtyp scale5 = (numtyp)1.0;
+      numtyp scale7 = (numtyp)1.0;
+      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
+      if (damp != (numtyp)0.0) {
+        numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
+        if (pgamma != (numtyp)0.0) {
+          damp = pgamma * ucl_powr(r/damp,(numtyp)1.5);
+          if (damp < (numtyp)50.0) {
+            numtyp expdamp = ucl_exp(-damp) ;
+            scale3 = (numtyp)1.0 - expdamp ;
+            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp);
+            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp);
+          }
+        } else {
+          pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
+          damp = pgamma * ucl_powr(r/damp,3.0);
+          if (damp < (numtyp)50.0) {
+            numtyp expdamp = ucl_exp(-damp);
+            scale3 = (numtyp)1.0 - expdamp;
+            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
+            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp);
+          }
+        }
+      } else { // damp == 0: ???
+      }
+
+      numtyp scalek = factor_dscale;
+      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
+      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
+      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
+      fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
+      fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
+      fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
+        
+      scalek = factor_pscale;
+      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
+      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
+      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
+      fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
+      fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
+      fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
+
+      _fieldp[0] += fid[0];
+      _fieldp[1] += fid[1];
+      _fieldp[2] += fid[2];
+      _fieldp[3] += fip[0];
+      _fieldp[4] += fip[1];
+      _fieldp[5] += fip[2];
+    }  // nbor
+
+  } // ii<inum
+
+  // accumulate field and fieldp
+  
+  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
+}
+
 /* ----------------------------------------------------------------------
    scan standard neighbor list and make it compatible with 1-5 neighbors
    if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h
index 842207dc73..ea4f8b9d1d 100644
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@@ -80,8 +80,10 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
 
  protected:
   bool _allocated;
-  int polar_real(const int eflag, const int vflag);
   int udirect2b(const int eflag, const int vflag);
+  int umutual2b(const int eflag, const int vflag);
+  int polar_real(const int eflag, const int vflag);
+  
 };
 
 }
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 9baa7b30d3..6bcd6c50c7 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -38,6 +38,7 @@ BaseAmoebaT::~BaseAmoeba() {
   delete nbor;
   k_polar.clear();
   k_udirect2b.clear();
+  k_umutual2b.clear();
   k_special15.clear();
   if (pair_program) delete pair_program;
 }
@@ -55,7 +56,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                              const double cell_size, const double gpu_split,
                              FILE *_screen, const void *pair_program,
                              const char *k_name_polar,
-                             const char *k_name_udirect2b) {
+                             const char *k_name_udirect2b,
+                             const char *k_name_umutual2b) {
   screen=_screen;
 
   int gpu_nbor=0;
@@ -87,7 +89,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
 
   _block_size=device->pair_block_size();
   _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b);
+  compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b);
 
   if (_threads_per_atom>1 && gpu_nbor==0) {
     nbor->packing(true);
@@ -230,7 +232,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall,
+void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole,
                           double **host_uind, double **host_uinp,
@@ -473,6 +475,75 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
   return firstneigh; //nbor->host_jlist.begin()-host_start;
 }
 
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary, and then compute the direct real space part
+//    of the induced field
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const int nall,
+                           double **host_x, int *host_type, int *host_amtype,
+                           int *host_amgroup, double **host_rpole,
+                           double **host_uind, double **host_uinp,
+                           double *sublo, double *subhi, tagint *tag,
+                           int **nspecial, tagint **special,
+                           int *nspecial15, tagint **special15,
+                           const bool eflag_in, const bool vflag_in,
+                           const bool eatom, const bool vatom, int &host_start,
+                           int **ilist, int **jnum, const double cpu_time,
+                           bool &success, double *host_q, double *boxlo,
+                           double *prd, void** fieldp_ptr) {
+  acc_timers();
+  int eflag, vflag;
+  if (eatom) eflag=2;
+  else if (eflag_in) eflag=1;
+  else eflag=0;
+  if (vatom) vflag=2;
+  else if (vflag_in) vflag=1;
+  else vflag=0;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflag) eflag=2;
+  if (vflag) vflag=2;
+  #endif
+
+  set_kernel(eflag,vflag);
+
+  // reallocate per-atom arrays, transfer extra data from the host
+  //   and build the neighbor lists if needed
+
+  int** firstneigh = nullptr;
+  firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
+                          host_amtype, host_amgroup, host_rpole,
+                          host_uind, host_uinp, sublo, subhi, tag,
+                          nspecial, special, nspecial15, special15,
+                          eflag_in, vflag_in, eatom, vatom,
+                          host_start, ilist, jnum, cpu_time,
+                          success, host_q, boxlo, prd);
+
+  // ------------------- Resize _fieldp array ------------------------
+
+  if (nall>_max_fieldp_size) {
+    _max_fieldp_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _fieldp.resize(_max_fieldp_size*8);
+  }
+  *fieldp_ptr=_fieldp.host.begin();
+
+  const int red_blocks=umutual2b(eflag,vflag);
+
+  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
+
+  _fieldp.update_host(_max_fieldp_size*8,false);
+/*
+  printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n",
+    this->_fieldp.cols(), _max_fieldp_size);
+  for (int i = 0; i < 10; i++) {
+    numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]);
+    printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z);
+  }
+*/
+  return firstneigh; //nbor->host_jlist.begin()-host_start;
+}
+
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary, and then compute polar real-space
 // ---------------------------------------------------------------------------
@@ -551,7 +622,6 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const
   return firstneigh; // nbor->host_jlist.begin()-host_start;
 }
 
-
 template <class numtyp, class acctyp>
 double BaseAmoebaT::host_memory_usage_atomic() const {
   return device->atom.host_memory_usage()+nbor->host_memory_usage()+
@@ -621,7 +691,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
 template <class numtyp, class acctyp>
 void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                   const char *kname_polar,
-                                  const char *kname_udirect2b) {
+                                  const char *kname_udirect2b,
+                                  const char *kname_umutual2b) {
   if (_compiled)
     return;
 
@@ -632,6 +703,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
   
   k_polar.set_function(*pair_program,kname_polar);
   k_udirect2b.set_function(*pair_program,kname_udirect2b);
+  k_umutual2b.set_function(*pair_program,kname_umutual2b);
   k_special15.set_function(*pair_program,"k_special15");
   pos_tex.get_texture(*pair_program,"pos_tex");
   q_tex.get_texture(*pair_program,"q_tex");
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index 7d4f4c00b5..3fb752c97c 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -54,7 +54,8 @@ class BaseAmoeba {
   int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15, const double cell_size,
                   const double gpu_split, FILE *screen, const void *pair_program,
-                  const char *kname_polar, const char *kname_udirect2b);
+                  const char *kname_polar, const char *kname_udirect2b,
+                  const char *kname_umutual2b);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead(const int add_kernels=0);
@@ -140,15 +141,31 @@ class BaseAmoeba {
                 int **&ilist, int **&numj, const double cpu_time, bool &success,
                 double *charge, double *boxlo, double *prd);
 
-  /// Compute polar real-space with host neighboring (not active for now)
-  void compute_polar_real(const int f_ago, const int inum_full, const int nall,
-               double **host_x, int *host_type, int *host_amtype,
-               int *host_amgroup, double **host_rpole, double **host_uind,
-               double **host_uinp, int *ilist, int *numj,
-               int **firstneigh, const bool eflag, const bool vflag,
-               const bool eatom, const bool vatom, int &host_start,
-               const double cpu_time, bool &success, double *charge,
-               const int nlocal, double *boxlo, double *prd, void **tep_ptr);
+  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
+  int** compute_udirect2b(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, int *host_amtype,
+                int *host_amgroup, double **host_rpole,
+                double **host_uind, double **host_uinp,
+                double *sublo, double *subhi,
+                tagint *tag, int **nspecial, tagint **special,
+                int *nspecial15, tagint **special15,
+                const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd, void **fieldp_ptr);
+
+  /// Compute the real space part of the induced field (umutual2b) with device neighboring
+  int** compute_umutual2b(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, int *host_amtype,
+                int *host_amgroup, double **host_rpole,
+                double **host_uind, double **host_uinp,
+                double *sublo, double *subhi,
+                tagint *tag, int **nspecial, tagint **special,
+                int *nspecial15, tagint **special15,
+                const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd, void **fieldp_ptr);
 
   /// Compute polar real-space with device neighboring
   int** compute_polar_real(const int ago, const int inum_full, const int nall,
@@ -162,18 +179,15 @@ class BaseAmoeba {
                 int **ilist, int **numj, const double cpu_time, bool &success,
                 double *charge, double *boxlo, double *prd, void **tep_ptr);
 
-  /// Compute the direct real space part of the permanent field (udirect2b) with device neighboring
-  int** compute_udirect2b(const int ago, const int inum_full, const int nall,
-                double **host_x, int *host_type, int *host_amtype,
-                int *host_amgroup, double **host_rpole,
-                double **host_uind, double **host_uinp,
-                double *sublo, double *subhi,
-                tagint *tag, int **nspecial, tagint **special,
-                int *nspecial15, tagint **special15,
-                const bool eflag, const bool vflag,
-                const bool eatom, const bool vatom, int &host_start,
-                int **ilist, int **numj, const double cpu_time, bool &success,
-                double *charge, double *boxlo, double *prd, void **fieldp_ptr);                
+  /// Compute polar real-space with host neighboring (not active for now)
+  void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *host_amtype,
+               int *host_amgroup, double **host_rpole, double **host_uind,
+               double **host_uinp, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               const int nlocal, double *boxlo, double *prd, void **tep_ptr);
 
   // -------------------------- DEVICE DATA -------------------------
 
@@ -224,7 +238,7 @@ class BaseAmoeba {
 
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pair_program;
-  UCL_Kernel k_polar, k_udirect2b, k_special15;
+  UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
   inline int block_size() { return _block_size; }
   inline void set_kernel(const int eflag, const int vflag) {}
 
@@ -241,10 +255,13 @@ class BaseAmoeba {
   UCL_D_Vec<int> *_nbor_data;
 
   void compile_kernels(UCL_Device &dev, const void *pair_string,
-     const char *kname_polar, const char *kname_udirect2b);
+     const char *kname_polar, const char *kname_udirect2b,
+     const char *kname_umutual2b);
 
-  virtual int polar_real(const int eflag, const int vflag) = 0;
   virtual int udirect2b(const int eflag, const int vflag) = 0;
+  virtual int umutual2b(const int eflag, const int vflag) = 0;
+  virtual int polar_real(const int eflag, const int vflag) = 0;
+  
 };
 
 }
diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h
index 0ec601de47..b2318d296e 100644
--- a/src/AMOEBA/pair_amoeba.h
+++ b/src/AMOEBA/pair_amoeba.h
@@ -365,9 +365,9 @@ class PairAmoeba : public Pair {
   void ulspred();
   void ufield0c(double **, double **);
   void uscale0b(int, double **, double **, double **, double **);
-  virtual void dfield0c(double **, double **);
+  void dfield0c(double **, double **);
   void umutual1(double **, double **);
-  void umutual2b(double **, double **);
+  virtual void umutual2b(double **, double **);
   void udirect1(double **);
   virtual void udirect2b(double **, double **);
   void dampmut(double, double, double, double *);
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index 3280e7b093..a1c21da3dd 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -74,7 +74,18 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal
               int **ilist, int **jnum, const double cpu_time,
               bool &success, double *host_q, double *boxlo, double *prd,
               void **fieldp_ptr);
-
+/*
+int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall,
+              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
+              double **host_rpole, double **host_uind, double **host_uinp, 
+              double *sublo, double *subhi, tagint *tag, int **nspecial,
+              tagint **special, int* nspecial15, tagint** special15,
+              const bool eflag, const bool vflag,
+              const bool eatom, const bool vatom, int &host_start,
+              int **ilist, int **jnum, const double cpu_time,
+              bool &success, double *host_q, double *boxlo, double *prd,
+              void **fieldp_ptr);
+*/
 int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall,
               double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
               double **host_rpole, double **host_uind, double **host_uinp,
@@ -100,6 +111,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
   tep_pinned = nullptr;
 
   gpu_udirect2b_ready = true;
+  gpu_umutual2b_ready = false;
   gpu_polar_real_ready = true;
 
   GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
@@ -142,16 +154,7 @@ void PairAmoebaGPU::polar_real()
     domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
   }
   inum = atom->nlocal;
-/*
-  if (comm->me == 0) {
-    printf("GPU: polar real\n");
-    for (int i = 0; i < 20; i++) {
-      printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n",
-        i, uind[i][0], uind[i][1], uind[i][2],
-        uinp[i][0], uinp[i][1], uinp[i][2]); 
-    }    
-  }
-*/
+
   firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
                                         atom->type, amtype, amgroup,
                                         rpole, uind, uinp, sublo, subhi,
@@ -993,6 +996,74 @@ void PairAmoebaGPU::udirect2b_cpu()
   }
 }
 
+/* ----------------------------------------------------------------------
+   umutual2b = Ewald real mutual field via list
+   umutual2b computes the real space contribution of the induced
+   atomic dipole moments to the field via a neighbor list
+------------------------------------------------------------------------- */
+
+void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
+{
+  if (!gpu_umutual2b_ready) {
+    PairAmoeba::umutual2b(field, fieldp);
+    return;
+  }
+/*   
+  int eflag=1, vflag=1;
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+
+  double sublo[3],subhi[3];
+  if (domain->triclinic == 0) {
+    sublo[0] = domain->sublo[0];
+    sublo[1] = domain->sublo[1];
+    sublo[2] = domain->sublo[2];
+    subhi[0] = domain->subhi[0];
+    subhi[1] = domain->subhi[1];
+    subhi[2] = domain->subhi[2];
+  } else {
+    domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+  }
+  inum = atom->nlocal;
+
+  firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x,
+                                        atom->type, amtype, amgroup, rpole, uind, uinp,
+                                        sublo, subhi, atom->tag, atom->nspecial, atom->special,
+                                        atom->nspecial15, atom->special15,
+                                        eflag, vflag, eflag_atom, vflag_atom,
+                                        host_start, &ilist, &numneigh, cpu_time,
+                                        success, atom->q, domain->boxlo,
+                                        domain->prd, &fieldp_pinned);
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  // accumulate the field and fieldp values from the GPU lib
+  //   field and fieldp may already have some nonzero values from kspace (udirect1)
+
+  int nlocal = atom->nlocal;
+  double *field_ptr = (double *)fieldp_pinned;
+
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 4*i;
+    field[i][0] += field_ptr[idx];
+    field[i][1] += field_ptr[idx+1];
+    field[i][2] += field_ptr[idx+2]; 
+  }
+
+  double* fieldp_ptr = (double *)fieldp_pinned;
+  fieldp_ptr += 4*inum;
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 4*i;
+    fieldp[i][0] += fieldp_ptr[idx];
+    fieldp[i][1] += fieldp_ptr[idx+1];
+    fieldp[i][2] += fieldp_ptr[idx+2];
+  }
+*/  
+}
+
 /* ---------------------------------------------------------------------- */
 
 double PairAmoebaGPU::memory_usage()
diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h
index d0cbad90a2..4dc547e469 100644
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@@ -37,6 +37,7 @@ class PairAmoebaGPU : public PairAmoeba {
 
   virtual void polar_real();
   virtual void udirect2b(double **, double **);
+  virtual void umutual2b(double **, double **);
 
  private:
   int gpu_mode;
@@ -45,8 +46,9 @@ class PairAmoebaGPU : public PairAmoeba {
   void *fieldp_pinned;
   bool tep_single;
 
-  bool gpu_polar_real_ready;
   bool gpu_udirect2b_ready;
+  bool gpu_umutual2b_ready;
+  bool gpu_polar_real_ready;
 
   void udirect2b_cpu();