From be2e437cecf47a20fcb5266277984b5a20a81bbd Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 14 Jul 2023 10:08:59 -0400
Subject: [PATCH] use approximation for erfc() on OpenCL for Intel since the
 OpenCL version seems broken

---
 lib/gpu/lal_amoeba.cu | 20 ++++++++++++++++++++
 lib/gpu/lal_hippo.cu  | 20 ++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 799cfe6d63..e7c313301e 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -585,7 +585,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[6];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
       bn[0] = ucl_erfc(ralpha) * rinv;
+#endif
 
       numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
       numtyp alsq2n = (numtyp)0.0;
@@ -802,7 +807,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[4], bcn[3];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
       bn[0] = ucl_erfc(ralpha) * rinv;
+#endif
 
       numtyp aefac = aesq2n;
       for (int m = 1; m <= 3; m++) {
@@ -976,7 +986,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[4];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
       bn[0] = ucl_erfc(ralpha) * rinv;
+#endif
 
       numtyp aefac = aesq2n;
       for (int m = 1; m <= 3; m++) {
@@ -1231,7 +1246,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[5];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
       bn[0] = ucl_erfc(ralpha) * rinv;
+#endif
 
       numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
       numtyp alsq2n = (numtyp)0.0;
diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu
index 92cb55f69b..7de7bd594f 100644
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@@ -1072,7 +1072,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[6];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
       bn[0] = ucl_erfc(ralpha) * rinv;
+#endif
 
       numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
       numtyp alsq2n = (numtyp)0.0;
@@ -1319,7 +1324,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[4];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
       bn[0] = ucl_erfc(ralpha) * rinv;
+#endif
 
       numtyp aefac = aesq2n;
       for (int m = 1; m <= 3; m++) {
@@ -1477,7 +1487,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[4];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
       bn[0] = ucl_erfc(ralpha) * rinv;
+#endif
 
       numtyp aefac = aesq2n;
       for (int m = 1; m <= 3; m++) {
@@ -1702,7 +1717,12 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
       numtyp ralpha = aewald * r;
       numtyp exp2a = ucl_exp(-ralpha*ralpha);
       numtyp bn[5];
+#ifdef INTEL_OCL
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
+#else
       bn[0] = ucl_erfc(ralpha) * rinv;
+#endif
 
       numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
       numtyp alsq2n = (numtyp)0.0;