another OpenCL bugfix attempt from Trung

This commit is contained in:
Axel Kohlmeyer
2023-02-07 17:31:43 -05:00
parent d170f83c6d
commit 3b4c873beb
6 changed files with 51 additions and 43 deletions

View File

@ -83,8 +83,8 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
int mtype=itype*lj_types+jtype;
if (rsq<mie3[mtype].w) {
numtyp r2inv = ucl_recip(rsq);
numtyp rgamA = pow(r2inv,(mie1[mtype].z/(numtyp)2.0));
numtyp rgamR = pow(r2inv,(mie1[mtype].w/(numtyp)2.0));
numtyp rgamA = ucl_powr(r2inv,(mie1[mtype].z/(numtyp)2.0));
numtyp rgamR = ucl_powr(r2inv,(mie1[mtype].w/(numtyp)2.0));
numtyp forcemie = (mie1[mtype].x*rgamR - mie1[mtype].y*rgamA);
numtyp force = factor_lj*forcemie*r2inv;
@ -177,8 +177,8 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
if (rsq<mie3[mtype].w) {
numtyp r2inv = ucl_recip(rsq);
numtyp rgamA = pow(r2inv,(mie1[mtype].z/(numtyp)2.0));
numtyp rgamR = pow(r2inv,(mie1[mtype].w/(numtyp)2.0));
numtyp rgamA = ucl_powr(r2inv,(mie1[mtype].z/(numtyp)2.0));
numtyp rgamR = ucl_powr(r2inv,(mie1[mtype].w/(numtyp)2.0));
numtyp forcemie = (mie1[mtype].x*rgamR - mie1[mtype].y*rgamA);
numtyp force = factor_lj*forcemie*r2inv;

View File

@ -171,6 +171,7 @@
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
#define ucl_exp native_exp
#define ucl_pow native_pow
#define ucl_powr native_powr
#define ucl_rsqrt native_rsqrt
#define ucl_sqrt native_sqrt
@ -179,6 +180,7 @@
#else
#define ucl_exp exp
#define ucl_pow pow
#define ucl_powr powr
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt

View File

@ -36,14 +36,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
}
ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
{
// return y = k * x (y can be at the same address as x)
y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
}
ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
const numtyp y[3], numtyp z[3])
{
// return z = k * x + y (z can be at the same address as x)
z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
}
@ -83,9 +85,9 @@ ucl_inline void costheta_d(const numtyp cos_theta,
const numtyp rij,
const numtyp rik_hat[3],
const numtyp rik,
numtyp *dri,
numtyp *drj,
numtyp *drk)
numtyp dri[3],
numtyp drj[3],
numtyp drk[3])
{
// first element is derivative wrt Ri, second wrt Rj, third wrt Rk
vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj);
@ -167,13 +169,13 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
{
const numtyp tmp = param_beta * zeta;
if (tmp > param_c1) {
*ans_d = param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5);
*ans_d = param_beta * (numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5);
return ucl_rsqrt(tmp);
}
if (tmp > param_c2) {
const numtyp ptmp = ucl_powr(tmp,-param_powern);
const numtyp ptmp = ucl_pow(tmp,-param_powern);
const numtyp i2n = ucl_recip((numtyp)2.0 * param_powern);
*ans_d = param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
*ans_d = param_beta * ((numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5) *
((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 * i2n) *
ptmp));
return ((numtyp)1.0 - ptmp * i2n)*ucl_rsqrt(tmp);
@ -183,14 +185,14 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
return (numtyp)1.0;
}
if (tmp < param_c3) {
*ans_d = (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0);
return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern);
*ans_d = (numtyp)-0.5*param_beta * ucl_pow(tmp,param_powern-(numtyp)1.0);
return (numtyp)1.0 - ucl_pow(tmp,param_powern)/((numtyp)2.0*param_powern);
}
const numtyp tmp_n = (numtyp)1.0+ucl_powr(tmp,param_powern);
const numtyp tmp_n = (numtyp)1.0+ucl_pow(tmp,param_powern);
const numtyp i2n = -ucl_recip((numtyp)2.0*param_powern);
*ans_d = (numtyp)-0.5*ucl_powr(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/
*ans_d = (numtyp)-0.5*ucl_pow(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/
zeta;
return ucl_powr(tmp_n, i2n);
return ucl_pow(tmp_n, i2n);
}
/* ---------------------------------------------------------------------- */

View File

@ -36,14 +36,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
}
ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
{
// return y = k * x (y can be x)
y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
}
ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
const numtyp y[3], numtyp z[3])
{
// return z = k * x + y (z can be x)
z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
}
@ -84,9 +86,9 @@ ucl_inline void costheta_d(const numtyp rij_hat[3],
const numtyp rij,
const numtyp rik_hat[3],
const numtyp rik,
numtyp *dri,
numtyp *drj,
numtyp *drk)
numtyp dri[3],
numtyp drj[3],
numtyp drk[3])
{
// first element is derivative wrt Ri, second wrt Rj, third wrt Rk
@ -163,9 +165,9 @@ ucl_inline numtyp ters_bij(const numtyp zeta,
{
numtyp tmp = param_beta * zeta;
if (tmp > param_ca1)
return ucl_powr(tmp, -param_powern/((numtyp)2.0*param_powern_del));
return ucl_pow(tmp, -param_powern/((numtyp)2.0*param_powern_del));
if (tmp < param_ca4) return (numtyp)1.0;
return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern),
return ucl_pow((numtyp)1.0 + ucl_pow(tmp,param_powern),
(numtyp)-1.0/((numtyp)2.0*param_powern_del));
}
@ -180,12 +182,12 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
{
numtyp tmp = param_beta * zeta;
if (tmp > param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) *
ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
ucl_pow(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
if (tmp < param_ca4) return (numtyp)0.0;
numtyp tmp_n = ucl_powr(tmp,param_powern);
numtyp tmp_n = ucl_pow(tmp,param_powern);
return (numtyp)-0.5 *(param_powern/param_powern_del) *
ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
ucl_pow((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
((numtyp)2.0*param_powern_del)))*tmp_n / zeta;
}

View File

@ -37,14 +37,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
}
ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
{
// return y = k * x (y can be at the same address as x)
y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
}
ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
const numtyp y[3], numtyp z[3])
{
// return z = k * x + y (z can be at the same address as x)
z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
}
@ -85,9 +87,9 @@ ucl_inline void costheta_d(const numtyp rij_hat[3],
const numtyp rij,
const numtyp rik_hat[3],
const numtyp rik,
numtyp *dri,
numtyp *drj,
numtyp *drk)
numtyp dri[3],
numtyp drj[3],
numtyp drk[3])
{
// first element is derivative wrt Ri, second wrt Rj, third wrt Rk
@ -187,12 +189,12 @@ ucl_inline numtyp ters_bij(const numtyp zeta,
numtyp tmp = param_beta * zeta;
if (tmp > param_c1) return ucl_rsqrt(tmp);
if (tmp > param_c2)
return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) /
return ((numtyp)1.0 - ucl_pow(tmp,-param_powern) /
((numtyp)2.0*param_powern))*ucl_rsqrt(tmp);
if (tmp < param_c4) return (numtyp)1.0;
if (tmp < param_c3)
return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern);
return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern),
return (numtyp)1.0 - ucl_pow(tmp,param_powern)/((numtyp)2.0*param_powern);
return ucl_pow((numtyp)1.0 + ucl_pow(tmp,param_powern),
(numtyp)-1.0/((numtyp)2.0*param_powern));
}
@ -208,19 +210,19 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
{
numtyp tmp = param_beta * zeta;
if (tmp > param_c1)
return param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5);
return param_beta * (numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5);
if (tmp > param_c2)
return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
return param_beta * ((numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5) *
// error in negligible 2nd term fixed 9/30/2015
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
ucl_powr(tmp,-param_powern)));
ucl_pow(tmp,-param_powern)));
if (tmp < param_c4) return (numtyp)0.0;
if (tmp < param_c3)
return (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0);
return (numtyp)-0.5*param_beta * ucl_pow(tmp,param_powern-(numtyp)1.0);
numtyp tmp_n = ucl_powr(tmp,param_powern);
return (numtyp)-0.5 * ucl_powr((numtyp)1.0+tmp_n, (numtyp) -
numtyp tmp_n = ucl_pow(tmp,param_powern);
return (numtyp)-0.5 * ucl_pow((numtyp)1.0+tmp_n, (numtyp) -
(numtyp)1.0-((numtyp)1.0 / ((numtyp)2.0 * param_powern)))*tmp_n / zeta;
}
@ -474,7 +476,7 @@ ucl_inline void repulsive(const numtyp param_bigr,
numtyp esq = global_e*global_e;
numtyp a_ij = ((numtyp)0.8854*global_a_0) /
(ucl_powr(param_Z_i,(numtyp)0.23) + ucl_powr(param_Z_j,(numtyp)0.23));
(ucl_pow(param_Z_i,(numtyp)0.23) + ucl_pow(param_Z_j,(numtyp)0.23));
numtyp premult = (param_Z_i * param_Z_j * esq)/((numtyp)4.0*MY_PI*global_epsilon_0);
numtyp r_ov_a = r/a_ij;
numtyp t1 = (numtyp)0.1818*ucl_exp((numtyp)-3.2*r_ov_a);

View File

@ -353,7 +353,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r4inv = rinvsq*rinvsq;
numtyp r6inv = rinvsq*r4inv;
numtyp reta = pow(r,-param1_eta);
numtyp reta = ucl_powr(r,-param1_eta);
numtyp lam1r = r*param1_lam1inv;
numtyp lam4r = r*param1_lam4inv;
numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;