From be2e437cecf47a20fcb5266277984b5a20a81bbd Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 14 Jul 2023 10:08:59 -0400 Subject: [PATCH] use approximation for erfc() on OpenCL for Intel since the OpenCL version seems broken --- lib/gpu/lal_amoeba.cu | 20 ++++++++++++++++++++ lib/gpu/lal_hippo.cu | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 799cfe6d63..e7c313301e 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -585,7 +585,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[6]; +#ifdef INTEL_OCL + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv; +#else bn[0] = ucl_erfc(ralpha) * rinv; +#endif numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; @@ -802,7 +807,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[4], bcn[3]; +#ifdef INTEL_OCL + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv; +#else bn[0] = ucl_erfc(ralpha) * rinv; +#endif numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -976,7 +986,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[4]; +#ifdef INTEL_OCL + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv; +#else bn[0] = ucl_erfc(ralpha) * rinv; +#endif numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1231,7 +1246,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[5]; +#ifdef INTEL_OCL + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv; +#else bn[0] = ucl_erfc(ralpha) * rinv; +#endif numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 92cb55f69b..7de7bd594f 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1072,7 +1072,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[6]; +#ifdef INTEL_OCL + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv; +#else bn[0] = ucl_erfc(ralpha) * rinv; +#endif numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; @@ -1319,7 +1324,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[4]; +#ifdef INTEL_OCL + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv; +#else bn[0] = ucl_erfc(ralpha) * rinv; +#endif numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1477,7 +1487,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[4]; +#ifdef INTEL_OCL + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv; +#else bn[0] = ucl_erfc(ralpha) * rinv; +#endif numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1702,7 +1717,12 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[5]; +#ifdef INTEL_OCL + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv; +#else bn[0] = ucl_erfc(ralpha) * rinv; +#endif numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0;