Use a polynomial approximation for erfc() when building for Intel OpenCL, since the native OpenCL erfc() appears to be broken there

2023-07-14 10:08:59 -04:00
parent 620c60122a
commit be2e437cec
2 changed files with 40 additions and 0 deletions
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -585,7 +585,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[6];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
      bn[0] = ucl_erfc(ralpha) * rinv;
+#endif

      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
@@ -802,7 +807,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[4], bcn[3];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
      bn[0] = ucl_erfc(ralpha) * rinv;
+#endif

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
@@ -976,7 +986,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[4];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
      bn[0] = ucl_erfc(ralpha) * rinv;
+#endif

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
@@ -1231,7 +1246,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[5];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
      bn[0] = ucl_erfc(ralpha) * rinv;
+#endif

      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@@ -1072,7 +1072,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[6];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
      bn[0] = ucl_erfc(ralpha) * rinv;
+#endif

      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
@@ -1319,7 +1324,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[4];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
      bn[0] = ucl_erfc(ralpha) * rinv;
+#endif

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
@@ -1477,7 +1487,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[4];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
      bn[0] = ucl_erfc(ralpha) * rinv;
+#endif

      numtyp aefac = aesq2n;
      for (int m = 1; m <= 3; m++) {
@@ -1702,7 +1717,12 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp bn[5];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
      bn[0] = ucl_erfc(ralpha) * rinv;
+#endif

      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;