diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 3b50feb6ed..d445305bb2 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -607,10 +607,13 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
 
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
       numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
       numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
       bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
+      
       numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
       numtyp alsq2n = (numtyp)0.0;
       if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
@@ -800,7 +803,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
       //if (r2>off2) continue;
 
       numtyp r = ucl_sqrt(r2);
-      numtyp rinv = ucl_recip(r);
+      numtyp rinv = ucl_rsqrt(r2);
       numtyp r2inv = rinv*rinv;
       numtyp rr1 = rinv;
       numtyp rr3 = rr1 * r2inv;
@@ -850,10 +853,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
 
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
       numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
       numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
       bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
 
       numtyp aefac = aesq2n;
       for (int m = 1; m <= 3; m++) {
@@ -1005,7 +1010,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
       //if (r2>off2) continue;
 
       numtyp r = ucl_sqrt(r2);
-      numtyp rinv = ucl_recip(r);
+      numtyp rinv = ucl_rsqrt(r2);
       numtyp r2inv = rinv*rinv;
       numtyp rr1 = rinv;
       numtyp rr3 = rr1 * r2inv;
@@ -1031,10 +1036,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
 
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
       numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
       numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
       bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
 
       numtyp aefac = aesq2n;
       for (int m = 1; m <= 3; m++) {
@@ -1298,10 +1305,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
 
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
       numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
       numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
       bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
+
       numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
       numtyp alsq2n = (numtyp)0.0;
       if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu
index b47e2d50e3..4f31650f73 100644
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@@ -1124,10 +1124,13 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
 
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
       numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
       numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
       bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
+
       numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
       numtyp alsq2n = (numtyp)0.0;
       if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
@@ -1400,10 +1403,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
 
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
       numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
       numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
       bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
 
       numtyp aefac = aesq2n;
       for (int m = 1; m <= 3; m++) {
@@ -1551,7 +1556,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
       //if (r2>off2) continue;
 
       numtyp r = ucl_sqrt(r2);
-      numtyp rinv = ucl_recip(r);
+      numtyp rinv = ucl_rsqrt(r2);
       numtyp r2inv = rinv*rinv;
       numtyp rr1 = rinv;
       numtyp rr3 = rr1 * r2inv;
@@ -1589,10 +1594,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
 
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
       numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
       numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
       bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
 
       numtyp aefac = aesq2n;
       for (int m = 1; m <= 3; m++) {
@@ -1838,10 +1845,13 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
 
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
       numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
       numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
       bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
+
       numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
       numtyp alsq2n = (numtyp)0.0;
       if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h
index 47a005b998..03c4fce85e 100644
--- a/lib/gpu/lal_pre_cuda_hip.h
+++ b/lib/gpu/lal_pre_cuda_hip.h
@@ -179,12 +179,15 @@
 #define ucl_cbrt cbrt
 #define ucl_ceil ceil
 #define ucl_abs fabs
+#define ucl_recip(x) ((numtyp)1.0/(x))
 #define ucl_rsqrt rsqrt
 #define ucl_sqrt sqrt
-#define ucl_recip(x) ((numtyp)1.0/(x))
+#define ucl_erfc erfc
 
 #else
 
+#define ucl_exp expf
+#define ucl_powr powf
 #define ucl_atan atanf
 #define ucl_cbrt cbrtf
 #define ucl_ceil ceilf
@@ -192,8 +195,7 @@
 #define ucl_recip(x) ((numtyp)1.0/(x))
 #define ucl_rsqrt rsqrtf
 #define ucl_sqrt sqrtf
-#define ucl_exp expf
-#define ucl_powr powf
+#define ucl_erfc erfcf
 
 #endif
 
diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h
index 2ef8af0911..c734e67b98 100644
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -166,6 +166,7 @@
 #define ucl_cbrt cbrt
 #define ucl_ceil ceil
 #define ucl_abs fabs
+#define ucl_erfc erfc
 
 #if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
 
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index 1376a6bd12..3b0268f6b4 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -949,29 +949,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
 
   amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp,
                                aewald, off2, &fieldp_pinned);
-/*
-  // accumulate the field and fieldp values from the GPU lib
-  //   field and fieldp may already have some nonzero values from kspace (umutual1)
-
-  int nlocal = atom->nlocal;
-  double *field_ptr = (double *)fieldp_pinned;
-
-  for (int i = 0; i < nlocal; i++) {
-    int idx = 4*i;
-    field[i][0] += field_ptr[idx];
-    field[i][1] += field_ptr[idx+1];
-    field[i][2] += field_ptr[idx+2];
-  }
-
-  double* fieldp_ptr = (double *)fieldp_pinned;
-  fieldp_ptr += 4*inum;
-  for (int i = 0; i < nlocal; i++) {
-    int idx = 4*i;
-    fieldp[i][0] += fieldp_ptr[idx];
-    fieldp[i][1] += fieldp_ptr[idx+1];
-    fieldp[i][2] += fieldp_ptr[idx+2];
-  }
-*/  
 }
 
 /* ----------------------------------------------------------------------