Use an approximation for erfc() on OpenCL for Intel, since the OpenCL version seems broken

This commit is contained in:
Axel Kohlmeyer
2023-07-14 10:08:59 -04:00
parent 620c60122a
commit be2e437cec
2 changed files with 40 additions and 0 deletions

View File

@ -585,7 +585,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[6];
#ifdef INTEL_OCL
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
#else
bn[0] = ucl_erfc(ralpha) * rinv;
#endif
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;
@ -802,7 +807,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[4], bcn[3];
#ifdef INTEL_OCL
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
#else
bn[0] = ucl_erfc(ralpha) * rinv;
#endif
numtyp aefac = aesq2n;
for (int m = 1; m <= 3; m++) {
@ -976,7 +986,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[4];
#ifdef INTEL_OCL
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
#else
bn[0] = ucl_erfc(ralpha) * rinv;
#endif
numtyp aefac = aesq2n;
for (int m = 1; m <= 3; m++) {
@ -1231,7 +1246,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[5];
#ifdef INTEL_OCL
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
#else
bn[0] = ucl_erfc(ralpha) * rinv;
#endif
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;

View File

@ -1072,7 +1072,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[6];
#ifdef INTEL_OCL
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
#else
bn[0] = ucl_erfc(ralpha) * rinv;
#endif
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;
@ -1319,7 +1324,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[4];
#ifdef INTEL_OCL
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
#else
bn[0] = ucl_erfc(ralpha) * rinv;
#endif
numtyp aefac = aesq2n;
for (int m = 1; m <= 3; m++) {
@ -1477,7 +1487,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[4];
#ifdef INTEL_OCL
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
#else
bn[0] = ucl_erfc(ralpha) * rinv;
#endif
numtyp aefac = aesq2n;
for (int m = 1; m <= 3; m++) {
@ -1702,7 +1717,12 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
numtyp bn[5];
#ifdef INTEL_OCL
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
bn[0] = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a * rinv;
#else
bn[0] = ucl_erfc(ralpha) * rinv;
#endif
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;