another OpenCL bugfix attempt from Trung

2023-02-07 17:31:43 -05:00
parent d170f83c6d
commit 3b4c873beb
6 changed files with 51 additions and 43 deletions
--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@ -83,8 +83,8 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
      int mtype=itype*lj_types+jtype;
      if (rsq<mie3[mtype].w) {
        numtyp r2inv = ucl_recip(rsq);
-        numtyp rgamA = pow(r2inv,(mie1[mtype].z/(numtyp)2.0));
-        numtyp rgamR = pow(r2inv,(mie1[mtype].w/(numtyp)2.0));
+        numtyp rgamA = ucl_powr(r2inv,(mie1[mtype].z/(numtyp)2.0));
+        numtyp rgamR = ucl_powr(r2inv,(mie1[mtype].w/(numtyp)2.0));
        numtyp forcemie =  (mie1[mtype].x*rgamR - mie1[mtype].y*rgamA);
        numtyp force = factor_lj*forcemie*r2inv;

@ -177,8 +177,8 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,

      if (rsq<mie3[mtype].w) {
        numtyp r2inv = ucl_recip(rsq);
-        numtyp rgamA = pow(r2inv,(mie1[mtype].z/(numtyp)2.0));
-        numtyp rgamR = pow(r2inv,(mie1[mtype].w/(numtyp)2.0));
+        numtyp rgamA = ucl_powr(r2inv,(mie1[mtype].z/(numtyp)2.0));
+        numtyp rgamR = ucl_powr(r2inv,(mie1[mtype].w/(numtyp)2.0));
        numtyp forcemie =  (mie1[mtype].x*rgamR - mie1[mtype].y*rgamA);
        numtyp force = factor_lj*forcemie*r2inv;

--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@ -171,6 +171,7 @@
 #if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)

 #define ucl_exp native_exp
+#define ucl_pow native_pow
 #define ucl_powr native_powr
 #define ucl_rsqrt native_rsqrt
 #define ucl_sqrt native_sqrt
@ -179,6 +180,7 @@
 #else

 #define ucl_exp exp
+#define ucl_pow pow
 #define ucl_powr powr
 #define ucl_rsqrt rsqrt
 #define ucl_sqrt sqrt
--- a/lib/gpu/lal_tersoff_extra.h
+++ b/lib/gpu/lal_tersoff_extra.h
@ -36,14 +36,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
  z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
 }

-ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
+ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
 {
+  // return y = k * x  (y can be at the same address as x)
  y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
 }

-ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
+ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
                              const numtyp y[3], numtyp z[3])
 {
+ // return z = k * x + y  (z can be at the same address as x)
  z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
 }

@ -83,9 +85,9 @@ ucl_inline void costheta_d(const numtyp cos_theta,
                           const numtyp rij,
                           const numtyp rik_hat[3],
                           const numtyp rik,
-                           numtyp *dri,
-                           numtyp *drj,
-                           numtyp *drk)
+                           numtyp dri[3],
+                           numtyp drj[3],
+                           numtyp drk[3])
 {
  // first element is derivative wrt Ri, second wrt Rj, third wrt Rk
  vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj);
@ -167,13 +169,13 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
 {
  const numtyp tmp = param_beta * zeta;
  if (tmp > param_c1) {
-    *ans_d = param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5);
+    *ans_d = param_beta * (numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5);
    return ucl_rsqrt(tmp);
  }
  if (tmp > param_c2) {
-    const numtyp ptmp = ucl_powr(tmp,-param_powern);
+    const numtyp ptmp = ucl_pow(tmp,-param_powern);
    const numtyp i2n = ucl_recip((numtyp)2.0 * param_powern);
-    *ans_d = param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
+    *ans_d = param_beta * ((numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5) *
                           ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 * i2n) *
                            ptmp));
    return ((numtyp)1.0 - ptmp * i2n)*ucl_rsqrt(tmp);
@ -183,14 +185,14 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
    return (numtyp)1.0;
  }
  if (tmp < param_c3) {
-    *ans_d = (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0);
-    return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern);
+    *ans_d = (numtyp)-0.5*param_beta * ucl_pow(tmp,param_powern-(numtyp)1.0);
+    return (numtyp)1.0 - ucl_pow(tmp,param_powern)/((numtyp)2.0*param_powern);
  }
-  const numtyp tmp_n = (numtyp)1.0+ucl_powr(tmp,param_powern);
+  const numtyp tmp_n = (numtyp)1.0+ucl_pow(tmp,param_powern);
  const numtyp i2n = -ucl_recip((numtyp)2.0*param_powern);
-  *ans_d = (numtyp)-0.5*ucl_powr(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/
+  *ans_d = (numtyp)-0.5*ucl_pow(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/
    zeta;
-  return ucl_powr(tmp_n, i2n);
+  return ucl_pow(tmp_n, i2n);
 }

 /* ---------------------------------------------------------------------- */
--- a/lib/gpu/lal_tersoff_mod_extra.h
+++ b/lib/gpu/lal_tersoff_mod_extra.h
@ -36,14 +36,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
  z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
 }

-ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
+ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
 {
+  // return y = k * x  (y can be x)
  y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
 }

-ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
+ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
                              const numtyp y[3], numtyp z[3])
 {
+  // return z = k * x + y  (z can be x)
  z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
 }

@ -84,9 +86,9 @@ ucl_inline void costheta_d(const numtyp rij_hat[3],
                           const numtyp rij,
                           const numtyp rik_hat[3],
                           const numtyp rik,
-                           numtyp *dri,
-                           numtyp *drj,
-                           numtyp *drk)
+                           numtyp dri[3],
+                           numtyp drj[3],
+                           numtyp drk[3])
 {
  // first element is derivative wrt Ri, second wrt Rj, third wrt Rk

@ -163,9 +165,9 @@ ucl_inline numtyp ters_bij(const numtyp zeta,
 {
  numtyp tmp = param_beta * zeta;
  if (tmp > param_ca1)
-    return ucl_powr(tmp, -param_powern/((numtyp)2.0*param_powern_del));
+    return ucl_pow(tmp, -param_powern/((numtyp)2.0*param_powern_del));
  if (tmp < param_ca4) return (numtyp)1.0;
-  return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern),
+  return ucl_pow((numtyp)1.0 + ucl_pow(tmp,param_powern),
    (numtyp)-1.0/((numtyp)2.0*param_powern_del));
 }

@ -180,12 +182,12 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
 {
  numtyp tmp = param_beta * zeta;
  if (tmp > param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) *
-          ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
+          ucl_pow(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
  if (tmp < param_ca4) return (numtyp)0.0;

-  numtyp tmp_n = ucl_powr(tmp,param_powern);
+  numtyp tmp_n = ucl_pow(tmp,param_powern);
  return (numtyp)-0.5 *(param_powern/param_powern_del) *
-          ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
+          ucl_pow((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
    ((numtyp)2.0*param_powern_del)))*tmp_n / zeta;
 }

--- a/lib/gpu/lal_tersoff_zbl_extra.h
+++ b/lib/gpu/lal_tersoff_zbl_extra.h
@ -37,14 +37,16 @@ ucl_inline void vec3_add(const numtyp x[3], const numtyp y[3], numtyp z[3])
  z[0] = x[0]+y[0]; z[1] = x[1]+y[1]; z[2] = x[2]+y[2];
 }

-ucl_inline void vec3_scale(const numtyp k, const numtyp x[3], numtyp y[3])
+ucl_inline void vec3_scale(const numtyp k, numtyp x[3], numtyp y[3])
 {
+  // return y = k * x  (y can be at the same address as x)
  y[0] = k*x[0]; y[1] = k*x[1]; y[2] = k*x[2];
 }

-ucl_inline void vec3_scaleadd(const numtyp k, const numtyp x[3],
+ucl_inline void vec3_scaleadd(const numtyp k, numtyp x[3],
                              const numtyp y[3], numtyp z[3])
 {
+  // return z = k * x + y  (z can be at the same address as x)
  z[0] = k*x[0]+y[0]; z[1] = k*x[1]+y[1]; z[2] = k*x[2]+y[2];
 }

@ -85,9 +87,9 @@ ucl_inline void costheta_d(const numtyp rij_hat[3],
                           const numtyp rij,
                           const numtyp rik_hat[3],
                           const numtyp rik,
-                           numtyp *dri,
-                           numtyp *drj,
-                           numtyp *drk)
+                           numtyp dri[3],
+                           numtyp drj[3],
+                           numtyp drk[3])
 {
  // first element is derivative wrt Ri, second wrt Rj, third wrt Rk

@ -187,12 +189,12 @@ ucl_inline numtyp ters_bij(const numtyp zeta,
  numtyp tmp = param_beta * zeta;
  if (tmp > param_c1) return ucl_rsqrt(tmp);
  if (tmp > param_c2)
-    return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) /
+    return ((numtyp)1.0 - ucl_pow(tmp,-param_powern) /
      ((numtyp)2.0*param_powern))*ucl_rsqrt(tmp);
  if (tmp < param_c4) return (numtyp)1.0;
  if (tmp < param_c3)
-    return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern);
-  return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern),
+    return (numtyp)1.0 - ucl_pow(tmp,param_powern)/((numtyp)2.0*param_powern);
+  return ucl_pow((numtyp)1.0 + ucl_pow(tmp,param_powern),
    (numtyp)-1.0/((numtyp)2.0*param_powern));
 }

@ -208,19 +210,19 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
 {
  numtyp tmp = param_beta * zeta;
  if (tmp > param_c1)
-    return param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5);
+    return param_beta * (numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5);
  if (tmp > param_c2)
-    return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
+    return param_beta * ((numtyp)-0.5*ucl_pow(tmp,(numtyp)-1.5) *
    // error in negligible 2nd term fixed 9/30/2015
                // (1.0 - 0.5*(1.0 +  1.0/(2.0*param->powern)) *
      ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
-       ucl_powr(tmp,-param_powern)));
+       ucl_pow(tmp,-param_powern)));
  if (tmp < param_c4) return (numtyp)0.0;
  if (tmp < param_c3)
-    return (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0);
+    return (numtyp)-0.5*param_beta * ucl_pow(tmp,param_powern-(numtyp)1.0);

-  numtyp tmp_n = ucl_powr(tmp,param_powern);
-  return (numtyp)-0.5 * ucl_powr((numtyp)1.0+tmp_n, (numtyp) -
+  numtyp tmp_n = ucl_pow(tmp,param_powern);
+  return (numtyp)-0.5 * ucl_pow((numtyp)1.0+tmp_n, (numtyp) -
    (numtyp)1.0-((numtyp)1.0 / ((numtyp)2.0 * param_powern)))*tmp_n / zeta;
 }

@ -474,7 +476,7 @@ ucl_inline void repulsive(const numtyp param_bigr,

  numtyp esq = global_e*global_e;
  numtyp a_ij = ((numtyp)0.8854*global_a_0) /
-    (ucl_powr(param_Z_i,(numtyp)0.23) + ucl_powr(param_Z_j,(numtyp)0.23));
+    (ucl_pow(param_Z_i,(numtyp)0.23) + ucl_pow(param_Z_j,(numtyp)0.23));
  numtyp premult = (param_Z_i * param_Z_j * esq)/((numtyp)4.0*MY_PI*global_epsilon_0);
  numtyp r_ov_a = r/a_ij;
  numtyp t1 = (numtyp)0.1818*ucl_exp((numtyp)-3.2*r_ov_a);
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@ -353,7 +353,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
        numtyp r4inv = rinvsq*rinvsq;
        numtyp r6inv = rinvsq*r4inv;

-        numtyp reta = pow(r,-param1_eta);
+        numtyp reta = ucl_powr(r,-param1_eta);
        numtyp lam1r = r*param1_lam1inv;
        numtyp lam4r = r*param1_lam4inv;
        numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;