diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu index fedfaf157a..61ac742321 100644 --- a/lib/gpu/lal_mie.cu +++ b/lib/gpu/lal_mie.cu @@ -83,8 +83,8 @@ __kernel void k_mie(const __global numtyp4 *restrict x_, int mtype=itype*lj_types+jtype; if (rsq param_c1) { - *ans_d = param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); + *ans_d = param_beta * (numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5); return ucl_rsqrt(tmp); } if (tmp > param_c2) { - const numtyp ptmp = ucl_powr(tmp,-param_powern); + const numtyp ptmp = ucl_pow(tmp,-param_powern); const numtyp i2n = ucl_recip((numtyp)2.0 * param_powern); - *ans_d = param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * + *ans_d = param_beta * ((numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5) * ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 * i2n) * ptmp)); return ((numtyp)1.0 - ptmp * i2n)*ucl_rsqrt(tmp); @@ -183,14 +185,14 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta, return (numtyp)1.0; } if (tmp < param_c3) { - *ans_d = (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); - return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); + *ans_d = (numtyp)-0.5*param_beta * ucl_pow(tmp,param_powern-(numtyp)1.0); + return (numtyp)1.0 - ucl_pow(tmp,param_powern)/((numtyp)2.0*param_powern); } - const numtyp tmp_n = (numtyp)1.0+ucl_powr(tmp,param_powern); + const numtyp tmp_n = (numtyp)1.0+ucl_pow(tmp,param_powern); const numtyp i2n = -ucl_recip((numtyp)2.0*param_powern); - *ans_d = (numtyp)-0.5*ucl_powr(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/ + *ans_d = (numtyp)-0.5*ucl_pow(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/ zeta; - return ucl_powr(tmp_n, i2n); + return ucl_pow(tmp_n, i2n); } /* ---------------------------------------------------------------------- */ diff --git a/lib/gpu/lal_tersoff_mod_extra.h b/lib/gpu/lal_tersoff_mod_extra.h index fb658cb0da..abf9e5dcd0 100644 --- a/lib/gpu/lal_tersoff_mod_extra.h +++ b/lib/gpu/lal_tersoff_mod_extra.h @@ -36,14 +36,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3]) z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2]; } -ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3]) +ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3]) { + // return y = k * x (y can be x) y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2]; } -ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3], +ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3], const numtyp y[3], numtyp z[3]) { + // return z = k * x + y (z can be x) z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2]; } @@ -84,9 +86,9 @@ ucl_inline void costheta_d(const numtyp rij_hat[3], const numtyp rij, const numtyp rik_hat[3], const numtyp rik, - numtyp *dri, - numtyp *drj, - numtyp *drk) + numtyp dri[3], + numtyp drj[3], + numtyp drk[3]) { // first element is derivative wrt Ri, second wrt Rj, third wrt Rk @@ -163,9 +165,9 @@ ucl_inline numtyp ters_bij(const numtyp zeta, { numtyp tmp = param_beta * zeta; if (tmp > param_ca1) - return ucl_powr(tmp, -param_powern/((numtyp)2.0*param_powern_del)); + return ucl_pow(tmp, -param_powern/((numtyp)2.0*param_powern_del)); if (tmp < param_ca4) return (numtyp)1.0; - return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern), + return ucl_pow((numtyp)1.0 + ucl_pow(tmp,param_powern), (numtyp)-1.0/((numtyp)2.0*param_powern_del)); } @@ -180,12 +182,12 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta, { numtyp tmp = param_beta * zeta; if (tmp > param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) * - ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta; + ucl_pow(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta; if (tmp < param_ca4) return (numtyp)0.0; - numtyp tmp_n = ucl_powr(tmp,param_powern); + numtyp tmp_n = ucl_pow(tmp,param_powern); return (numtyp)-0.5 *(param_powern/param_powern_del) * - ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 / + ucl_pow((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 / ((numtyp)2.0*param_powern_del)))*tmp_n / zeta; } diff --git a/lib/gpu/lal_tersoff_zbl_extra.h b/lib/gpu/lal_tersoff_zbl_extra.h index 9e5bcb10b4..d9b691fdfa 100644 --- a/lib/gpu/lal_tersoff_zbl_extra.h +++ b/lib/gpu/lal_tersoff_zbl_extra.h @@ -37,14 +37,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3]) z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2]; } -ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3]) +ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3]) { + // return y = k * x (y can be at the same address as x) y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2]; } -ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3], +ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3], const numtyp y[3], numtyp z[3]) { + // return z = k * x + y (z can be at the same address as x) z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2]; } @@ -85,9 +87,9 @@ ucl_inline void costheta_d(const numtyp rij_hat[3], const numtyp rij, const numtyp rik_hat[3], const numtyp rik, - numtyp *dri, - numtyp *drj, - numtyp *drk) + numtyp dri[3], + numtyp drj[3], + numtyp drk[3]) { // first element is derivative wrt Ri, second wrt Rj, third wrt Rk @@ -187,12 +189,12 @@ ucl_inline numtyp ters_bij(const numtyp zeta, numtyp tmp = param_beta * zeta; if (tmp > param_c1) return ucl_rsqrt(tmp); if (tmp > param_c2) - return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) / + return ((numtyp)1.0 - ucl_pow(tmp,-param_powern) / ((numtyp)2.0*param_powern))*ucl_rsqrt(tmp); if (tmp < param_c4) return (numtyp)1.0; if (tmp < param_c3) - return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); - return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern), + return (numtyp)1.0 - ucl_pow(tmp,param_powern)/((numtyp)2.0*param_powern); + return ucl_pow((numtyp)1.0 + ucl_pow(tmp,param_powern), (numtyp)-1.0/((numtyp)2.0*param_powern)); } @@ -208,19 +210,19 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta, { numtyp tmp = param_beta * zeta; if (tmp > param_c1) - return param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); + return param_beta * (numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5); if (tmp > param_c2) - return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * + return param_beta * ((numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5) * // error in negligible 2nd term fixed 9/30/2015 // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * - ucl_powr(tmp,-param_powern))); + ucl_pow(tmp,-param_powern))); if (tmp < param_c4) return (numtyp)0.0; if (tmp < param_c3) - return (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); + return (numtyp)-0.5*param_beta * ucl_pow(tmp,param_powern-(numtyp)1.0); - numtyp tmp_n = ucl_powr(tmp,param_powern); - return (numtyp)-0.5 * ucl_powr((numtyp)1.0+tmp_n, (numtyp) - + numtyp tmp_n = ucl_pow(tmp,param_powern); + return (numtyp)-0.5 * ucl_pow((numtyp)1.0+tmp_n, (numtyp) - (numtyp)1.0-((numtyp)1.0 / ((numtyp)2.0 * param_powern)))*tmp_n / zeta; } @@ -474,7 +476,7 @@ ucl_inline void repulsive(const numtyp param_bigr, numtyp esq = global_e*global_e; numtyp a_ij = ((numtyp)0.8854*global_a_0) / - (ucl_powr(param_Z_i,(numtyp)0.23) + ucl_powr(param_Z_j,(numtyp)0.23)); + (ucl_pow(param_Z_i,(numtyp)0.23) + ucl_pow(param_Z_j,(numtyp)0.23)); numtyp premult = (param_Z_i * param_Z_j * esq)/((numtyp)4.0*MY_PI*global_epsilon_0); numtyp r_ov_a = r/a_ij; numtyp t1 = (numtyp)0.1818*ucl_exp((numtyp)-3.2*r_ov_a); diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index 6c9ba14b4a..f4595f4715 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -353,7 +353,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp r4inv = rinvsq*rinvsq; numtyp r6inv = rinvsq*r4inv; - numtyp reta = pow(r,-param1_eta); + numtyp reta = ucl_powr(r,-param1_eta); numtyp lam1r = r*param1_lam1inv; numtyp lam4r = r*param1_lam4inv; numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;