Added ucl_erfc to the opencl, cuda and hip backends; reverted to using erfc instead of approximation to ensure double-precision matches

This commit is contained in:
Trung Nguyen
2022-07-25 15:34:44 -05:00
parent 288fd5add4
commit 93784f35e3
5 changed files with 37 additions and 37 deletions

View File

@ -607,10 +607,13 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
/*
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
*/
bn[0] = ucl_erfc(ralpha) * rinv;
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;
if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
@ -800,7 +803,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
//if (r2>off2) continue;
numtyp r = ucl_sqrt(r2);
numtyp rinv = ucl_recip(r);
numtyp rinv = ucl_rsqrt(r2);
numtyp r2inv = rinv*rinv;
numtyp rr1 = rinv;
numtyp rr3 = rr1 * r2inv;
@ -850,10 +853,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
/*
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
*/
bn[0] = ucl_erfc(ralpha) * rinv;
numtyp aefac = aesq2n;
for (int m = 1; m <= 3; m++) {
@ -1005,7 +1010,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
//if (r2>off2) continue;
numtyp r = ucl_sqrt(r2);
numtyp rinv = ucl_recip(r);
numtyp rinv = ucl_rsqrt(r2);
numtyp r2inv = rinv*rinv;
numtyp rr1 = rinv;
numtyp rr3 = rr1 * r2inv;
@ -1031,10 +1036,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
/*
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
*/
bn[0] = ucl_erfc(ralpha) * rinv;
numtyp aefac = aesq2n;
for (int m = 1; m <= 3; m++) {
@ -1298,10 +1305,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
/*
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
*/
bn[0] = ucl_erfc(ralpha) * rinv;
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;
if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);

View File

@ -1124,10 +1124,13 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
/*
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
*/
bn[0] = ucl_erfc(ralpha) * rinv;
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;
if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
@ -1400,10 +1403,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
/*
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
*/
bn[0] = ucl_erfc(ralpha) * rinv;
numtyp aefac = aesq2n;
for (int m = 1; m <= 3; m++) {
@ -1551,7 +1556,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
//if (r2>off2) continue;
numtyp r = ucl_sqrt(r2);
numtyp rinv = ucl_recip(r);
numtyp rinv = ucl_rsqrt(r2);
numtyp r2inv = rinv*rinv;
numtyp rr1 = rinv;
numtyp rr3 = rr1 * r2inv;
@ -1589,10 +1594,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
/*
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
*/
bn[0] = ucl_erfc(ralpha) * rinv;
numtyp aefac = aesq2n;
for (int m = 1; m <= 3; m++) {
@ -1838,10 +1845,13 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
numtyp ralpha = aewald * r;
numtyp exp2a = ucl_exp(-ralpha*ralpha);
/*
numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
//bn[0] = erfc(ralpha) / r;
bn[0] = _erfc * rinv;
*/
bn[0] = ucl_erfc(ralpha) * rinv;
numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
numtyp alsq2n = (numtyp)0.0;
if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);

View File

@ -179,12 +179,15 @@
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_erfc erfc
#else
#define ucl_exp expf
#define ucl_powr powf
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
@ -192,8 +195,7 @@
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#define ucl_exp expf
#define ucl_powr powf
#define ucl_erfc erfcf
#endif

View File

@ -166,6 +166,7 @@
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_erfc erfc
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)

View File

@ -949,29 +949,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp,
aewald, off2, &fieldp_pinned);
/*
// accumulate the field and fieldp values from the GPU lib
// field and fieldp may already have some nonzero values from kspace (umutual1)
int nlocal = atom->nlocal;
double *field_ptr = (double *)fieldp_pinned;
for (int i = 0; i < nlocal; i++) {
int idx = 4*i;
field[i][0] += field_ptr[idx];
field[i][1] += field_ptr[idx+1];
field[i][2] += field_ptr[idx+2];
}
double* fieldp_ptr = (double *)fieldp_pinned;
fieldp_ptr += 4*inum;
for (int i = 0; i < nlocal; i++) {
int idx = 4*i;
fieldp[i][0] += fieldp_ptr[idx];
fieldp[i][1] += fieldp_ptr[idx+1];
fieldp[i][2] += fieldp_ptr[idx+2];
}
*/
}
/* ----------------------------------------------------------------------