Working on the umutual2b kernel, the tdipdip values are computed on the fly for now, maybe a seprate neigh list as in the CPU version will be more efficient

2021-09-09 16:52:27 -05:00
parent efe0bf593f
commit b654f293ee
2 changed files with 41 additions and 108 deletions
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -743,9 +743,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
    int itype,igroup;
    numtyp bn[4],bcn[3];
    numtyp fid[3],fip[3];
-    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
    
-    ci  = polar1[i].x;    // rpole[i][0];
    dix = polar1[i].y;    // rpole[i][1];
    diy = polar1[i].z;    // rpole[i][2];
    diz = polar1[i].w;    // rpole[i][3];
@ -934,10 +932,9 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
  acctyp _fieldp[6];
  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;

-  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
-  numtyp4* polar1 = (numtyp4*)(&extra[0]);
-  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+  numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
+  numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);

  //numtyp4 xi__;

@ -953,32 +950,13 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
    int itype,igroup;
    numtyp bn[4],bcn[3];
    numtyp fid[3],fip[3];
-    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
    
-    ci  = polar1[i].x;    // rpole[i][0];
-    dix = polar1[i].y;    // rpole[i][1];
-    diy = polar1[i].z;    // rpole[i][2];
-    diz = polar1[i].w;    // rpole[i][3];
-    qixx = polar2[i].x;   // rpole[i][4];
-    qixy = polar2[i].y;   // rpole[i][5];
-    qixz = polar2[i].z;   // rpole[i][6];
-    qiyy = polar2[i].w;   // rpole[i][8];
-    qiyz   = polar3[i].x; // rpole[i][9];
-    qizz   = polar3[i].y; // rpole[i][12];
    itype  = polar3[i].z; // amtype[i];
    igroup = polar3[i].w; // amgroup[i];
    
-    // debug:
-    // xi__ = ix; xi__.w = itype;
-
    numtyp pdi = damping[itype].x;
-    numtyp pti = damping[itype].y;
    numtyp ddi = damping[itype].z;

-    numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
-    numtyp aesq2n = (numtyp)0.0;
-    if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald);
-
    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=dev_packed[nbor];
@ -1001,106 +979,54 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
      numtyp rr1 = rinv;
      numtyp rr3 = rr1 * r2inv;
      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
-      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;

-      numtyp ck = polar1[j].x;   // rpole[j][0];
-      numtyp dkx = polar1[j].y;  // rpole[j][1];
-      numtyp dky = polar1[j].z;  // rpole[j][2];
-      numtyp dkz = polar1[j].w;  // rpole[j][3];
-      numtyp qkxx = polar2[j].x; // rpole[j][4];
-      numtyp qkxy = polar2[j].y; // rpole[j][5];
-      numtyp qkxz = polar2[j].z; // rpole[j][6];
-      numtyp qkyy = polar2[j].w; // rpole[j][8];
-      numtyp qkyz = polar3[j].x; // rpole[j][9];
-      numtyp qkzz = polar3[j].y; // rpole[j][12];
      int jtype =   polar3[j].z; // amtype[j];
      int jgroup =  polar3[j].w; // amgroup[j];
+      numtyp ukx = polar4[j].x;  // uind[j][0];
+      numtyp uky = polar4[j].y;  // uind[j][1];
+      numtyp ukz = polar4[j].z;  // uind[j][2];
+      numtyp ukxp = polar5[j].x; // uinp[j][0];
+      numtyp ukyp = polar5[j].y; // uinp[j][1];
+      numtyp ukzp = polar5[j].z; // uinp[j][2];

-      numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
-      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
-      factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
-      if (igroup == jgroup) {
-        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
-        factor_dscale = polar_dscale;
-        factor_uscale = polar_uscale;
-      } else {
-        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
-        factor_dscale = factor_uscale = (numtyp)1.0;
-      }
-
-      // intermediates involving moments and separation distance
-
-      numtyp dir = dix*xr + diy*yr + diz*zr;
-      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
-      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
-      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
-      numtyp qir = qix*xr + qiy*yr + qiz*zr;
-      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
-      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
-      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
-      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
-      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
-
-      // calculate the real space Ewald error function terms
-
-      numtyp ralpha = aewald * r;
-      numtyp exp2a = ucl_exp(-ralpha*ralpha);
-      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
-      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
-      //bn[0] = erfc(ralpha) / r;
-      bn[0] = _erfc * rinv;
-
-      numtyp aefac = aesq2n;
-      for (int m = 1; m <= 3; m++) {
-        numtyp bfac = (numtyp) (m+m-1);
-        aefac = aesq2 * aefac;
-        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv;
-      }
-
-      // find the field components for Thole polarization damping
+      numtyp factor_uscale;

+      // find terms needed later to compute mutual polarization
+      // if (poltyp != DIRECT) 
      numtyp scale3 = (numtyp)1.0;
      numtyp scale5 = (numtyp)1.0;
-      numtyp scale7 = (numtyp)1.0;
      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
      if (damp != (numtyp)0.0) {
        numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
        if (pgamma != (numtyp)0.0) {
-          damp = pgamma * ucl_powr(r/damp,(numtyp)1.5);
-          if (damp < (numtyp)50.0) {
-            numtyp expdamp = ucl_exp(-damp) ;
-            scale3 = (numtyp)1.0 - expdamp ;
-            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp);
-            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp);
-          }
-        } else {
-          pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
-          damp = pgamma * ucl_powr(r/damp,3.0);
+          damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
          if (damp < (numtyp)50.0) {
            numtyp expdamp = ucl_exp(-damp);
            scale3 = (numtyp)1.0 - expdamp;
            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
-            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp);
          }
        }
      } else { // damp == 0: ???
      }

-      numtyp scalek = factor_dscale;
+      numtyp scalek = factor_uscale;
      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
-      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
-      fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
-      fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
-      fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
+      numtyp tdipdip[6];
+      tdipdip[0] = -bcn[0] + bcn[1]*xr*xr;
+      tdipdip[1] = bcn[1]*xr*yr;
+      tdipdip[2] = bcn[1]*xr*zr;
+      tdipdip[3] = -bcn[0] + bcn[1]*yr*yr;
+      tdipdip[4] = bcn[1]*yr*zr;
+      tdipdip[5] = -bcn[0] + bcn[1]*zr*zr;

-      scalek = factor_pscale;
-      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
-      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
-      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
-      fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
-      fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
-      fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
+      fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz;
+      fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz;
+      fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz;
+      
+      fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp;
+      fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp;
+      fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp;
      
      _fieldp[0] += fid[0];
      _fieldp[1] += fid[1];
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -431,8 +431,10 @@ void PairAmoebaGPU::induce()

      ufield0c(field,fieldp);

-      crstyle = FIELD;
-      comm->reverse_comm_pair(this);
+      if (!gpu_umutual2b_ready) {
+        crstyle = FIELD;
+        comm->reverse_comm_pair(this);
+      }

      for (i = 0; i < nlocal; i++) {
 	      itype = amtype[i];
@ -525,8 +527,11 @@ void PairAmoebaGPU::induce()
 */
    ufield0c(field,fieldp);

-    crstyle = FIELD;
-    comm->reverse_comm_pair(this);
+    if (!gpu_umutual2b_ready) {
+      crstyle = FIELD;
+      comm->reverse_comm_pair(this);
+    }
+
 /*
    if (comm->me == 0) {
      printf("GPU: cutghost = %f\n", comm->cutghost[0]);
@ -593,8 +598,10 @@ void PairAmoebaGPU::induce()

      //error->all(FLERR,"STOP");

-      crstyle = FIELD;
-      comm->reverse_comm_pair(this);
+      if (!gpu_umutual2b_ready) {
+        crstyle = FIELD;
+        comm->reverse_comm_pair(this);
+      }
 /*     
     if (comm->me == 0) {
       printf("GPU: iter = %d\n", iter);