Added ucl_erfc to the opencl, cuda and hip backends; reverted to using erfc instead of approximation to ensure double-precision matches

2022-07-25 15:34:44 -05:00
parent 288fd5add4
commit 93784f35e3
5 changed files with 37 additions and 37 deletions
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -607,10 +607,13 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
+      
      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
@ -800,7 +803,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
      //if (r2>off2) continue;

      numtyp r = ucl_sqrt(r2);
-      numtyp rinv = ucl_recip(r);
+      numtyp rinv = ucl_rsqrt(r2);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = rinv;
      numtyp rr3 = rr1 * r2inv;
@ -850,10 +853,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
@ -1005,7 +1010,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
      //if (r2>off2) continue;

      numtyp r = ucl_sqrt(r2);
-      numtyp rinv = ucl_recip(r);
+      numtyp rinv = ucl_rsqrt(r2);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = rinv;
      numtyp rr3 = rr1 * r2inv;
@ -1031,10 +1036,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
@ -1298,10 +1305,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
+
      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@ -1124,10 +1124,13 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
+
      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
@ -1400,10 +1403,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
@ -1551,7 +1556,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
      //if (r2>off2) continue;

      numtyp r = ucl_sqrt(r2);
-      numtyp rinv = ucl_recip(r);
+      numtyp rinv = ucl_rsqrt(r2);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = rinv;
      numtyp rr3 = rr1 * r2inv;
@ -1589,10 +1594,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
@ -1838,10 +1845,13 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,

      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      /*
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
+      */
+      bn[0] = ucl_erfc(ralpha) * rinv;
+
      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
--- a/lib/gpu/lal_pre_cuda_hip.h
+++ b/lib/gpu/lal_pre_cuda_hip.h
@ -179,12 +179,15 @@
 #define ucl_cbrt cbrt
 #define ucl_ceil ceil
 #define ucl_abs fabs
+#define ucl_recip(x) ((numtyp)1.0/(x))
 #define ucl_rsqrt rsqrt
 #define ucl_sqrt sqrt
-#define ucl_recip(x) ((numtyp)1.0/(x))
+#define ucl_erfc erfc

 #else

+#define ucl_exp expf
+#define ucl_powr powf
 #define ucl_atan atanf
 #define ucl_cbrt cbrtf
 #define ucl_ceil ceilf
@ -192,8 +195,7 @@
 #define ucl_recip(x) ((numtyp)1.0/(x))
 #define ucl_rsqrt rsqrtf
 #define ucl_sqrt sqrtf
-#define ucl_exp expf
-#define ucl_powr powf
+#define ucl_erfc erfcf

 #endif

--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@ -166,6 +166,7 @@
 #define ucl_cbrt cbrt
 #define ucl_ceil ceil
 #define ucl_abs fabs
+#define ucl_erfc erfc

 #if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)

--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -949,29 +949,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)

  amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp,
                               aewald, off2, &fieldp_pinned);
-/*
-  // accumulate the field and fieldp values from the GPU lib
-  //   field and fieldp may already have some nonzero values from kspace (umutual1)
-
-  int nlocal = atom->nlocal;
-  double *field_ptr = (double *)fieldp_pinned;
-
-  for (int i = 0; i < nlocal; i++) {
-    int idx = 4*i;
-    field[i][0] += field_ptr[idx];
-    field[i][1] += field_ptr[idx+1];
-    field[i][2] += field_ptr[idx+2];
-  }
-
-  double* fieldp_ptr = (double *)fieldp_pinned;
-  fieldp_ptr += 4*inum;
-  for (int i = 0; i < nlocal; i++) {
-    int idx = 4*i;
-    fieldp[i][0] += fieldp_ptr[idx];
-    fieldp[i][1] += fieldp_ptr[idx+1];
-    fieldp[i][2] += fieldp_ptr[idx+2];
-  }
-*/  
 }

 /* ----------------------------------------------------------------------