another OpenCL bugfix attempt from Trung
This commit is contained in:
@ -83,8 +83,8 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (rsq<mie3[mtype].w) {
|
||||
numtyp r2inv = ucl_recip(rsq);
|
||||
numtyp rgamA = pow(r2inv,(mie1[mtype].z/(numtyp)2.0));
|
||||
numtyp rgamR = pow(r2inv,(mie1[mtype].w/(numtyp)2.0));
|
||||
numtyp rgamA = ucl_powr(r2inv,(mie1[mtype].z/(numtyp)2.0));
|
||||
numtyp rgamR = ucl_powr(r2inv,(mie1[mtype].w/(numtyp)2.0));
|
||||
numtyp forcemie = (mie1[mtype].x*rgamR - mie1[mtype].y*rgamA);
|
||||
numtyp force = factor_lj*forcemie*r2inv;
|
||||
|
||||
@ -177,8 +177,8 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (rsq<mie3[mtype].w) {
|
||||
numtyp r2inv = ucl_recip(rsq);
|
||||
numtyp rgamA = pow(r2inv,(mie1[mtype].z/(numtyp)2.0));
|
||||
numtyp rgamR = pow(r2inv,(mie1[mtype].w/(numtyp)2.0));
|
||||
numtyp rgamA = ucl_powr(r2inv,(mie1[mtype].z/(numtyp)2.0));
|
||||
numtyp rgamR = ucl_powr(r2inv,(mie1[mtype].w/(numtyp)2.0));
|
||||
numtyp forcemie = (mie1[mtype].x*rgamR - mie1[mtype].y*rgamA);
|
||||
numtyp force = factor_lj*forcemie*r2inv;
|
||||
|
||||
|
||||
@ -171,6 +171,7 @@
|
||||
#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)
|
||||
|
||||
#define ucl_exp native_exp
|
||||
#define ucl_pow native_pow
|
||||
#define ucl_powr native_powr
|
||||
#define ucl_rsqrt native_rsqrt
|
||||
#define ucl_sqrt native_sqrt
|
||||
@ -179,6 +180,7 @@
|
||||
#else
|
||||
|
||||
#define ucl_exp exp
|
||||
#define ucl_pow pow
|
||||
#define ucl_powr powr
|
||||
#define ucl_rsqrt rsqrt
|
||||
#define ucl_sqrt sqrt
|
||||
|
||||
@ -36,14 +36,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
|
||||
z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
|
||||
}
|
||||
|
||||
ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
|
||||
ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
|
||||
{
|
||||
// return y = k * x (y can be at the same address as x)
|
||||
y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
|
||||
}
|
||||
|
||||
ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
|
||||
ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
|
||||
const numtyp y[3], numtyp z[3])
|
||||
{
|
||||
// return z = k * x + y (z can be at the same address as x)
|
||||
z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
|
||||
}
|
||||
|
||||
@ -83,9 +85,9 @@ ucl_inline void costheta_d(const numtyp cos_theta,
|
||||
const numtyp rij,
|
||||
const numtyp rik_hat[3],
|
||||
const numtyp rik,
|
||||
numtyp *dri,
|
||||
numtyp *drj,
|
||||
numtyp *drk)
|
||||
numtyp dri[3],
|
||||
numtyp drj[3],
|
||||
numtyp drk[3])
|
||||
{
|
||||
// first element is derivative wrt Ri, second wrt Rj, third wrt Rk
|
||||
vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj);
|
||||
@ -167,13 +169,13 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
|
||||
{
|
||||
const numtyp tmp = param_beta * zeta;
|
||||
if (tmp > param_c1) {
|
||||
*ans_d = param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5);
|
||||
*ans_d = param_beta * (numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5);
|
||||
return ucl_rsqrt(tmp);
|
||||
}
|
||||
if (tmp > param_c2) {
|
||||
const numtyp ptmp = ucl_powr(tmp,-param_powern);
|
||||
const numtyp ptmp = ucl_pow(tmp,-param_powern);
|
||||
const numtyp i2n = ucl_recip((numtyp)2.0 * param_powern);
|
||||
*ans_d = param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
|
||||
*ans_d = param_beta * ((numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5) *
|
||||
((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 * i2n) *
|
||||
ptmp));
|
||||
return ((numtyp)1.0 - ptmp * i2n)*ucl_rsqrt(tmp);
|
||||
@ -183,14 +185,14 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
|
||||
return (numtyp)1.0;
|
||||
}
|
||||
if (tmp < param_c3) {
|
||||
*ans_d = (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0);
|
||||
return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern);
|
||||
*ans_d = (numtyp)-0.5*param_beta * ucl_pow(tmp,param_powern-(numtyp)1.0);
|
||||
return (numtyp)1.0 - ucl_pow(tmp,param_powern)/((numtyp)2.0*param_powern);
|
||||
}
|
||||
const numtyp tmp_n = (numtyp)1.0+ucl_powr(tmp,param_powern);
|
||||
const numtyp tmp_n = (numtyp)1.0+ucl_pow(tmp,param_powern);
|
||||
const numtyp i2n = -ucl_recip((numtyp)2.0*param_powern);
|
||||
*ans_d = (numtyp)-0.5*ucl_powr(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/
|
||||
*ans_d = (numtyp)-0.5*ucl_pow(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/
|
||||
zeta;
|
||||
return ucl_powr(tmp_n, i2n);
|
||||
return ucl_pow(tmp_n, i2n);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
@ -36,14 +36,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
|
||||
z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
|
||||
}
|
||||
|
||||
ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
|
||||
ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
|
||||
{
|
||||
// return y = k * x (y can be x)
|
||||
y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
|
||||
}
|
||||
|
||||
ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
|
||||
ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
|
||||
const numtyp y[3], numtyp z[3])
|
||||
{
|
||||
// return z = k * x + y (z can be x)
|
||||
z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
|
||||
}
|
||||
|
||||
@ -84,9 +86,9 @@ ucl_inline void costheta_d(const numtyp rij_hat[3],
|
||||
const numtyp rij,
|
||||
const numtyp rik_hat[3],
|
||||
const numtyp rik,
|
||||
numtyp *dri,
|
||||
numtyp *drj,
|
||||
numtyp *drk)
|
||||
numtyp dri[3],
|
||||
numtyp drj[3],
|
||||
numtyp drk[3])
|
||||
{
|
||||
// first element is derivative wrt Ri, second wrt Rj, third wrt Rk
|
||||
|
||||
@ -163,9 +165,9 @@ ucl_inline numtyp ters_bij(const numtyp zeta,
|
||||
{
|
||||
numtyp tmp = param_beta * zeta;
|
||||
if (tmp > param_ca1)
|
||||
return ucl_powr(tmp, -param_powern/((numtyp)2.0*param_powern_del));
|
||||
return ucl_pow(tmp, -param_powern/((numtyp)2.0*param_powern_del));
|
||||
if (tmp < param_ca4) return (numtyp)1.0;
|
||||
return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern),
|
||||
return ucl_pow((numtyp)1.0 + ucl_pow(tmp,param_powern),
|
||||
(numtyp)-1.0/((numtyp)2.0*param_powern_del));
|
||||
}
|
||||
|
||||
@ -180,12 +182,12 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
|
||||
{
|
||||
numtyp tmp = param_beta * zeta;
|
||||
if (tmp > param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) *
|
||||
ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
|
||||
ucl_pow(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
|
||||
if (tmp < param_ca4) return (numtyp)0.0;
|
||||
|
||||
numtyp tmp_n = ucl_powr(tmp,param_powern);
|
||||
numtyp tmp_n = ucl_pow(tmp,param_powern);
|
||||
return (numtyp)-0.5 *(param_powern/param_powern_del) *
|
||||
ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
|
||||
ucl_pow((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
|
||||
((numtyp)2.0*param_powern_del)))*tmp_n / zeta;
|
||||
}
|
||||
|
||||
|
||||
@ -37,14 +37,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
|
||||
z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
|
||||
}
|
||||
|
||||
ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
|
||||
ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
|
||||
{
|
||||
// return y = k * x (y can be at the same address as x)
|
||||
y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
|
||||
}
|
||||
|
||||
ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
|
||||
ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
|
||||
const numtyp y[3], numtyp z[3])
|
||||
{
|
||||
// return z = k * x + y (z can be at the same address as x)
|
||||
z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
|
||||
}
|
||||
|
||||
@ -85,9 +87,9 @@ ucl_inline void costheta_d(const numtyp rij_hat[3],
|
||||
const numtyp rij,
|
||||
const numtyp rik_hat[3],
|
||||
const numtyp rik,
|
||||
numtyp *dri,
|
||||
numtyp *drj,
|
||||
numtyp *drk)
|
||||
numtyp dri[3],
|
||||
numtyp drj[3],
|
||||
numtyp drk[3])
|
||||
{
|
||||
// first element is derivative wrt Ri, second wrt Rj, third wrt Rk
|
||||
|
||||
@ -187,12 +189,12 @@ ucl_inline numtyp ters_bij(const numtyp zeta,
|
||||
numtyp tmp = param_beta * zeta;
|
||||
if (tmp > param_c1) return ucl_rsqrt(tmp);
|
||||
if (tmp > param_c2)
|
||||
return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) /
|
||||
return ((numtyp)1.0 - ucl_pow(tmp,-param_powern) /
|
||||
((numtyp)2.0*param_powern))*ucl_rsqrt(tmp);
|
||||
if (tmp < param_c4) return (numtyp)1.0;
|
||||
if (tmp < param_c3)
|
||||
return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern);
|
||||
return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern),
|
||||
return (numtyp)1.0 - ucl_pow(tmp,param_powern)/((numtyp)2.0*param_powern);
|
||||
return ucl_pow((numtyp)1.0 + ucl_pow(tmp,param_powern),
|
||||
(numtyp)-1.0/((numtyp)2.0*param_powern));
|
||||
}
|
||||
|
||||
@ -208,19 +210,19 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
|
||||
{
|
||||
numtyp tmp = param_beta * zeta;
|
||||
if (tmp > param_c1)
|
||||
return param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5);
|
||||
return param_beta * (numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5);
|
||||
if (tmp > param_c2)
|
||||
return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
|
||||
return param_beta * ((numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5) *
|
||||
// error in negligible 2nd term fixed 9/30/2015
|
||||
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
|
||||
((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
|
||||
ucl_powr(tmp,-param_powern)));
|
||||
ucl_pow(tmp,-param_powern)));
|
||||
if (tmp < param_c4) return (numtyp)0.0;
|
||||
if (tmp < param_c3)
|
||||
return (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0);
|
||||
return (numtyp)-0.5*param_beta * ucl_pow(tmp,param_powern-(numtyp)1.0);
|
||||
|
||||
numtyp tmp_n = ucl_powr(tmp,param_powern);
|
||||
return (numtyp)-0.5 * ucl_powr((numtyp)1.0+tmp_n, (numtyp) -
|
||||
numtyp tmp_n = ucl_pow(tmp,param_powern);
|
||||
return (numtyp)-0.5 * ucl_pow((numtyp)1.0+tmp_n, (numtyp) -
|
||||
(numtyp)1.0-((numtyp)1.0 / ((numtyp)2.0 * param_powern)))*tmp_n / zeta;
|
||||
}
|
||||
|
||||
@ -474,7 +476,7 @@ ucl_inline void repulsive(const numtyp param_bigr,
|
||||
|
||||
numtyp esq = global_e*global_e;
|
||||
numtyp a_ij = ((numtyp)0.8854*global_a_0) /
|
||||
(ucl_powr(param_Z_i,(numtyp)0.23) + ucl_powr(param_Z_j,(numtyp)0.23));
|
||||
(ucl_pow(param_Z_i,(numtyp)0.23) + ucl_pow(param_Z_j,(numtyp)0.23));
|
||||
numtyp premult = (param_Z_i * param_Z_j * esq)/((numtyp)4.0*MY_PI*global_epsilon_0);
|
||||
numtyp r_ov_a = r/a_ij;
|
||||
numtyp t1 = (numtyp)0.1818*ucl_exp((numtyp)-3.2*r_ov_a);
|
||||
|
||||
@ -353,7 +353,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
||||
numtyp r4inv = rinvsq*rinvsq;
|
||||
numtyp r6inv = rinvsq*r4inv;
|
||||
|
||||
numtyp reta = pow(r,-param1_eta);
|
||||
numtyp reta = ucl_powr(r,-param1_eta);
|
||||
numtyp lam1r = r*param1_lam1inv;
|
||||
numtyp lam4r = r*param1_lam4inv;
|
||||
numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
|
||||
|
||||
Reference in New Issue
Block a user