diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 3b50feb6ed..d445305bb2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -607,10 +607,13 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -800,7 +803,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -850,10 +853,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1005,7 +1010,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -1031,10 +1036,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1298,10 +1305,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index b47e2d50e3..4f31650f73 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1124,10 +1124,13 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -1400,10 +1403,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1551,7 +1556,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -1589,10 +1594,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1838,10 +1845,13 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h index 47a005b998..03c4fce85e 100644 --- a/lib/gpu/lal_pre_cuda_hip.h +++ b/lib/gpu/lal_pre_cuda_hip.h @@ -179,12 +179,15 @@ #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs +#define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt -#define ucl_recip(x) ((numtyp)1.0/(x)) +#define ucl_erfc erfc #else +#define ucl_exp expf +#define ucl_powr powf #define ucl_atan atanf #define ucl_cbrt cbrtf #define ucl_ceil ceilf @@ -192,8 +195,7 @@ #define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrtf #define ucl_sqrt sqrtf -#define ucl_exp expf -#define ucl_powr powf +#define ucl_erfc erfcf #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 2ef8af0911..c734e67b98 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -166,6 +166,7 @@ #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs +#define ucl_erfc erfc #if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 1376a6bd12..3b0268f6b4 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -949,29 +949,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, off2, &fieldp_pinned); -/* - // accumulate the field and fieldp values from the GPU lib - // field and fieldp may already have some nonzero values from kspace (umutual1) - - int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; - - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } - - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; - } -*/ } /* ----------------------------------------------------------------------