diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp
index 0f87104832..77bbebbb9a 100644
--- a/lib/gpu/lal_hippo.cpp
+++ b/lib/gpu/lal_hippo.cpp
@@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full,
 
   // leave the answers (forces, energies and virial) on the device,
   //   only copy them back in the last kernel (this one, or polar_real once done)
-  //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
-  //this->device->add_ans_object(this->ans);
+  this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
+  this->device->add_ans_object(this->ans);
 
   this->hd_balancer.stop_timer();
 
@@ -444,7 +444,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full,
     numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
     printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
   }
-*/  
+*/
   return firstneigh; // nbor->host_jlist.begin()-host_start;
 }
 
diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu
index 1f9c14d4da..95f18db7d2 100644
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@@ -1032,6 +1032,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
       numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass];
       numtyp valk = polar6[j].x;
 
+      if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole);
       // intermediates involving moments and separation distance
 
       numtyp dir = dix*xr + diy*yr + diz*zr;
@@ -1772,7 +1773,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
       //if (r2>off2) continue;
   
       numtyp r = ucl_sqrt(r2);
-      
+
       const numtyp4 pol1j = polar1[j];
       numtyp ck = polar1[j].x;   // rpole[j][0];
       numtyp dkx = polar1[j].y;  // rpole[j][1];
@@ -1800,6 +1801,11 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
       numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
       const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
       factor_wscale = sp_pol.x; // special_polar_wscale[sbmask15(jextra)];
+      // NOTE: for in.water_box/water_hexamer.hippo: there exist wscale = 0.2
+      //if (factor_wscale < (numtyp)1.0) continue; //factor_wscale = (numtyp)0;
+
+      //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, r, factor_wscale);
+
       if (igroup == jgroup) {
         factor_dscale = factor_pscale = sp_pol.y; // special_polar_piscale[sbmask15(jextra)];
         factor_uscale = polar_uscale;
@@ -1910,7 +1916,8 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
       dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir;
       dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
       dufld[5] += zr*tiz5 + zr*zr*tuir;
-      
+
+
       // get the field gradient for direct polarization force
       
       numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i;
@@ -1929,7 +1936,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
       term1k = rr3k - rr5k*xr*xr;
       term2k = (numtyp)2.0*rr5k*xr;
       term3k = rr7k*xr*xr - rr5k;
-      term4k = 2.0*rr5k;
+      term4k = (numtyp)2.0*rr5k;
       term5k = (numtyp)5.0*rr7k*xr;
       term6k = rr9k*xr*xr;
       tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i -
@@ -2046,7 +2053,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
       numtyp frcx = (numtyp)-2.0 * depx;
       numtyp frcy = (numtyp)-2.0 * depy;
       numtyp frcz = (numtyp)-2.0 * depz;
-
+       
       // get the dEp/dR terms used for direct polarization force
       // poltyp == MUTUAL && hippo
       // tixx and tkxx
@@ -2159,7 +2166,7 @@ __kernel void k_special15(__global int * dev_nbor,
       int which = sj >> SBBITS & 3;
       int j = sj & NEIGHMASK;
       tagint jtag = tag[j];
-
+      if (i == 0 && j < 20) printf("GPU: j = %d; jtag = %d\n", j, jtag);
       if (!which) {
         int offset=ii;
         for (int k=0; k<n15; k++) {
diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h
index 890ce51121..c5a97d919a 100644
--- a/lib/gpu/lal_hippo_extra.h
+++ b/lib/gpu/lal_hippo_extra.h
@@ -81,14 +81,14 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
     pre = (numtyp)128.0;
     s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;
 
-    ds = (dmpi22*r3 + dmpi23*r4) * expi / 3.0;
-    d2s = dmpi24 * expi * r5 / 9.0;
-    d3s = dmpi25 * expi * r6 / 45.0;
-    d4s = (dmpi25*r6 + dmpi26*r7) * expi / 315.0;
+    ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0;
+    d2s = dmpi24 * expi * r5 / (numtyp)9.0;
+    d3s = dmpi25 * expi * r6 / (numtyp)45.0;
+    d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0;
     if (rorder >= 11) {
       r8 = r7 * r;
       dmpi27 = dmpi2 * dmpi26;
-      d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0;
+      d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0;
     }
 
   // treat the case where alpha damping exponents are unequal
@@ -97,12 +97,12 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
     r3 = r2 * r;
     r4 = r3 * r;
     r5 = r4 * r;
-    dmpi2 = 0.5 * dmpi;
-    dmpk2 = 0.5 * dmpk;
+    dmpi2 = (numtyp)0.5 * dmpi;
+    dmpk2 = (numtyp)0.5 * dmpk;
     dampi = dmpi2 * r;
     dampk = dmpk2 * r;
-    expi = exp(-dampi);
-    expk = exp(-dampk);
+    expi = ucl_exp(-dampi);
+    expk = ucl_exp(-dampk);
     dmpi22 = dmpi2 * dmpi2;
     dmpi23 = dmpi22 * dmpi2;
     dmpi24 = dmpi23 * dmpi2;
@@ -112,34 +112,34 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
     dmpk24 = dmpk23 * dmpk2;
     dmpk25 = dmpk24 * dmpk2;
     term = dmpi22 - dmpk22;
-    pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0);
-    tmp = 4.0 * dmpi2 * dmpk2 / term;
+    pre = (numtyp)8192.0 * dmpi23 * dmpk23 / ucl_powr(term,(numtyp)4.0);
+    tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term;
     s = (dampi-tmp)*expk + (dampk+tmp)*expi;
 
-    ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - 
-          4.0*dmpi2*dmpk2/term) * expk + 
-      (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi;
-    d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - 
-           (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
-           4.0*dmpi2*dmpk2/term) * expk + 
-      (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + 
-       (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 
-       4.0*dmpi2*dmpk2/term) * expi;
-    d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - 
-           (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - 
-           4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + 
-      (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + 
-       (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + 
-       4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi;
-    d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + 
-           dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - 
-           (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - 
-           (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
-           4.0*dmpi2*dmpk2/term) * expk + 
-      (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + 
-       dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + 
-       (4.0/105.0)*dmpi25*dmpk2*r4/term + (8.0/21.0)*dmpi24*dmpk2*r3/term + 
-       (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 
+    ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - 
+          (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
+    d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - 
+           ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - 
+           (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + 
+       ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 
+       (numtyp)4.0*dmpi2*dmpk2/term) * expi;
+    d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - 
+           (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - 
+           (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + 
+      (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + 
+       ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + 
+       (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi;
+    d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + 
+           dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - 
+           ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - 
+           ((numtyp)12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
+           (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + 
+       dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + 
+       (4.0/105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/21.0)*dmpi24*dmpk2*r3/term + 
+       (12.0/7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 
        4.0*dmpi2*dmpk2/term) * expi;
     
     if (rorder >= 11) {
@@ -217,8 +217,8 @@ ucl_inline void damppole(const numtyp r, const int rorder,
   diff = fabs(alphai-alphak);
   dampi = alphai * r;
   dampk = alphak * r;
-  expi = exp(-dampi);
-  expk = exp(-dampk);
+  expi = ucl_exp(-dampi);
+  expk = ucl_exp(-dampk);
 
   // core-valence charge penetration damping for Gordon f1
 
@@ -308,15 +308,15 @@ ucl_inline void damppole(const numtyp r, const int rorder,
     if (rorder >= 11) {
       dampi6 = dampi3 * dampi3;
       dampk6 = dampk3 * dampk3;
-      dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
-                                5.0*dampi4/126.0 + 2.0*dampi5/315.0 + 
-                                dampi6/1890.0)*expi - 
-        termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + 
-                2.0*dampk5/315.0 + dampk6/1890.0)*expk - 
-        2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + 
-                          dampi4/63.0 + dampi5/945.0)*expi - 
-        2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + 
-                          dampk4/63.0 + dampk5/945.0)*expk;
+      dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
+                                (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + 
+                                dampi6/(numtyp)1890.0)*expi - 
+        termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + 5.0*dampk4/(numtyp)126.0 + 
+                (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - 
+        (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + 
+                          dampi4/63.0 + dampi5/(numtyp)945.0)*expi - 
+        (numtyp)2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + 
+                          dampk4/63.0 + dampk5/(numtyp)945.0)*expk;
     }
   }
 }
diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp
index e6b3e6ef70..4fa8a5d892 100644
--- a/src/AMOEBA/amoeba_polar.cpp
+++ b/src/AMOEBA/amoeba_polar.cpp
@@ -366,6 +366,7 @@ void PairAmoeba::polar_real()
       yr = x[j][1] - yi;
       zr = x[j][2] - zi;
       r2 = xr*xr + yr*yr + zr*zr;
+
       if (r2 > off2) continue; 
 
       jtype = amtype[j];
@@ -393,7 +394,7 @@ void PairAmoeba::polar_real()
 	  factor_uscale = 1.0;
 	}
       }
-
+      //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale);
       r = sqrt(r2);
       ck = rpole[j][0];
       dkx = rpole[j][1];
@@ -567,7 +568,7 @@ void PairAmoeba::polar_real()
       ufld[j][0] += tkx3 + xr*tukr;
       ufld[j][1] += tky3 + yr*tukr;
       ufld[j][2] += tkz3 + zr*tukr;
-      
+
       // get induced dipole field gradient used for quadrupole torques
       
       if (amoeba) {
@@ -579,7 +580,6 @@ void PairAmoeba::polar_real()
         tkz5 = 2.0 * (psr5*uiz+dsr5*uizp);
         tuir = -psr7*ukr - dsr7*ukrp;
         tukr = -psr7*uir - dsr7*uirp;
-      // reached here...
       } else if (hippo) {
         tix5 = 4.0 * (rr5i*ukx);
         tiy5 = 4.0 * (rr5i*uky);
@@ -597,7 +597,6 @@ void PairAmoeba::polar_real()
       dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir;
       dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir;
       dufld[i][5] += zr*tiz5 + zr*zr*tuir;
-
       dufld[j][0] -= xr*tkx5 + xr*xr*tukr;
       dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr;
       dufld[j][2] -= yr*tky5 + yr*yr*tukr;
@@ -668,7 +667,7 @@ void PairAmoeba::polar_real()
         frcx = depx;
         frcy = depy;
         frcz = depz;
-        
+
         // get the dEp/dR terms used for direct polarization force
         
         term1 = bn[2] - psc3*rr5;
@@ -855,6 +854,7 @@ void PairAmoeba::polar_real()
         frcx = -2.0 * depx;
         frcy = -2.0 * depy;
         frcz = -2.0 * depz;
+
       }
 
       // get the dtau/dr terms used for mutual polarization force
@@ -1199,6 +1199,8 @@ void PairAmoeba::polar_real()
 
     torque2force(i,tep,fix,fiy,fiz,fpolar);
 
+    //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, tep[0], tep[1], tep[2]);
+
     iz = zaxis2local[i];
     ix = xaxis2local[i];
     iy = yaxis2local[i];
diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp
index 5157739f0e..1ff35e7ce1 100644
--- a/src/AMOEBA/pair_amoeba.cpp
+++ b/src/AMOEBA/pair_amoeba.cpp
@@ -242,6 +242,47 @@ void PairAmoeba::compute(int eflag, int vflag)
     time_induce = time_polar = time_qxfer = 0.0;
   }
 
+  { // DEBUGGING
+  double **x = atom->x;
+  int inum,jnum;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  if (use_ewald) choose(MPOLE_LONG);
+  else choose(MPOLE);
+
+  int i,ii,j,jj;
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    double xi = x[i][0];
+    double yi = x[i][1];
+    double zi = x[i][2];
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      double factor_mpole = special_mpole[sbmask15(j)];
+      j &= NEIGHMASK15;
+
+      double xr = x[j][0] - xi;
+      double yr = x[j][1] - yi;
+      double zr = x[j][2] - zi;
+      double r2 = xr*xr + yr*yr + zr*zr;
+      if (r2 > off2) continue;
+      double r = sqrt(r2);
+      if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole);
+    }
+  }
+  
+
+  } // DEBUGGING
+
   double evdwl;
 
   evdwl = 0.0;
@@ -973,8 +1014,8 @@ void PairAmoeba::init_style()
 
   int irequest = neighbor->request(this,instance_me);
   // for DEBUGGING with GPU
-  //neighbor->requests[irequest]->half = 0;
-  //neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
 
   // open debug output files
   // names are hard-coded
diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp
index 23395e5fe3..4da0056029 100644
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
   gpu_multipole_real_ready = true;
   gpu_udirect2b_ready = false;
   gpu_umutual2b_ready = false;
-  gpu_polar_real_ready = true;
+  gpu_polar_real_ready = false;
 
   GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }
@@ -1137,6 +1137,8 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr,
     _tq[2] = tq_ptr[4*i+2];
     torque2force(i,_tq,fix,fiy,fiz,force_comp);
 
+    //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, _tq[0], _tq[1], _tq[2]);
+
     iz = zaxis2local[i];
     ix = xaxis2local[i];
     iy = yaxis2local[i];