Trying to find the difference in the neighbor list build in hippo vs amoeba

2021-09-27 11:35:35 -05:00
parent 7437c98628
commit 2efd841a7e
6 changed files with 114 additions and 62 deletions
--- a/lib/gpu/lal_hippo.cpp
+++ b/lib/gpu/lal_hippo.cpp
@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full,

  // leave the answers (forces, energies and virial) on the device,
  //   only copy them back in the last kernel (this one, or polar_real once done)
-  //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
-  //this->device->add_ans_object(this->ans);
+  this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
+  this->device->add_ans_object(this->ans);

  this->hd_balancer.stop_timer();

--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@ -1032,6 +1032,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
      numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass];
      numtyp valk = polar6[j].x;

+      if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole);
      // intermediates involving moments and separation distance

      numtyp dir = dix*xr + diy*yr + diz*zr;
@ -1800,6 +1801,11 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
      numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
      factor_wscale = sp_pol.x; // special_polar_wscale[sbmask15(jextra)];
+      // NOTE: for in.water_box/water_hexamer.hippo: there exist wscale = 0.2
+      //if (factor_wscale < (numtyp)1.0) continue; //factor_wscale = (numtyp)0;
+
+      //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, r, factor_wscale);
+
      if (igroup == jgroup) {
        factor_dscale = factor_pscale = sp_pol.y; // special_polar_piscale[sbmask15(jextra)];
        factor_uscale = polar_uscale;
@ -1911,6 +1917,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
      dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
      dufld[5] += zr*tiz5 + zr*zr*tuir;

+
      // get the field gradient for direct polarization force
      
      numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i;
@ -1929,7 +1936,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
      term1k = rr3k - rr5k*xr*xr;
      term2k = (numtyp)2.0*rr5k*xr;
      term3k = rr7k*xr*xr - rr5k;
-      term4k = 2.0*rr5k;
+      term4k = (numtyp)2.0*rr5k;
      term5k = (numtyp)5.0*rr7k*xr;
      term6k = rr9k*xr*xr;
      tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i -
@ -2159,7 +2166,7 @@ __kernel void k_special15(__global int * dev_nbor,
      int which = sj >> SBBITS & 3;
      int j = sj & NEIGHMASK;
      tagint jtag = tag[j];
-
+      if (i == 0 && j < 20) printf("GPU: j = %d; jtag = %d\n", j, jtag);
      if (!which) {
        int offset=ii;
        for (int k=0; k<n15; k++) {
--- a/lib/gpu/lal_hippo_extra.h
+++ b/lib/gpu/lal_hippo_extra.h
@ -81,14 +81,14 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
    pre = (numtyp)128.0;
    s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;

-    ds = (dmpi22*r3 + dmpi23*r4) * expi / 3.0;
-    d2s = dmpi24 * expi * r5 / 9.0;
-    d3s = dmpi25 * expi * r6 / 45.0;
-    d4s = (dmpi25*r6 + dmpi26*r7) * expi / 315.0;
+    ds = (dmpi22*r3 + dmpi23*r4) * expi / (numtyp)3.0;
+    d2s = dmpi24 * expi * r5 / (numtyp)9.0;
+    d3s = dmpi25 * expi * r6 / (numtyp)45.0;
+    d4s = (dmpi25*r6 + dmpi26*r7) * expi / (numtyp)315.0;
    if (rorder >= 11) {
      r8 = r7 * r;
      dmpi27 = dmpi2 * dmpi26;
-      d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0;
+      d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0;
    }

  // treat the case where alpha damping exponents are unequal
@ -97,12 +97,12 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
    r3 = r2 * r;
    r4 = r3 * r;
    r5 = r4 * r;
-    dmpi2 = 0.5 * dmpi;
-    dmpk2 = 0.5 * dmpk;
+    dmpi2 = (numtyp)0.5 * dmpi;
+    dmpk2 = (numtyp)0.5 * dmpk;
    dampi = dmpi2 * r;
    dampk = dmpk2 * r;
-    expi = exp(-dampi);
-    expk = exp(-dampk);
+    expi = ucl_exp(-dampi);
+    expk = ucl_exp(-dampk);
    dmpi22 = dmpi2 * dmpi2;
    dmpi23 = dmpi22 * dmpi2;
    dmpi24 = dmpi23 * dmpi2;
@ -112,34 +112,34 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
    dmpk24 = dmpk23 * dmpk2;
    dmpk25 = dmpk24 * dmpk2;
    term = dmpi22 - dmpk22;
-    pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0);
-    tmp = 4.0 * dmpi2 * dmpk2 / term;
+    pre = (numtyp)8192.0 * dmpi23 * dmpk23 / ucl_powr(term,(numtyp)4.0);
+    tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term;
    s = (dampi-tmp)*expk + (dampk+tmp)*expi;

-    ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - 
-          4.0*dmpi2*dmpk2/term) * expk + 
-      (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi;
-    d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - 
-           (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
-           4.0*dmpi2*dmpk2/term) * expk + 
-      (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + 
-       (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 
-       4.0*dmpi2*dmpk2/term) * expi;
-    d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - 
-           (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - 
-           4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + 
-      (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + 
-       (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + 
-       4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi;
-    d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + 
-           dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - 
-           (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - 
-           (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
-           4.0*dmpi2*dmpk2/term) * expk + 
-      (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + 
-       dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + 
-       (4.0/105.0)*dmpi25*dmpk2*r4/term + (8.0/21.0)*dmpi24*dmpk2*r3/term + 
-       (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 
+    ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - 
+          (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
+    d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - 
+           ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - 
+           (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + 
+       ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 
+       (numtyp)4.0*dmpi2*dmpk2/term) * expi;
+    d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - 
+           (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - 
+           (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + 
+      (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + 
+       ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + 
+       (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi;
+    d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + 
+           dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - 
+           ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - 
+           ((numtyp)12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
+           (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + 
+       dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + 
+       (4.0/105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/21.0)*dmpi24*dmpk2*r3/term + 
+       (12.0/7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 
       4.0*dmpi2*dmpk2/term) * expi;
    
    if (rorder >= 11) {
@ -217,8 +217,8 @@ ucl_inline void damppole(const numtyp r, const int rorder,
  diff = fabs(alphai-alphak);
  dampi = alphai * r;
  dampk = alphak * r;
-  expi = exp(-dampi);
-  expk = exp(-dampk);
+  expi = ucl_exp(-dampi);
+  expk = ucl_exp(-dampk);

  // core-valence charge penetration damping for Gordon f1

@ -308,15 +308,15 @@ ucl_inline void damppole(const numtyp r, const int rorder,
    if (rorder >= 11) {
      dampi6 = dampi3 * dampi3;
      dampk6 = dampk3 * dampk3;
-      dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
-                                5.0*dampi4/126.0 + 2.0*dampi5/315.0 + 
-                                dampi6/1890.0)*expi - 
-        termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + 
-                2.0*dampk5/315.0 + dampk6/1890.0)*expk - 
-        2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + 
-                          dampi4/63.0 + dampi5/945.0)*expi - 
-        2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + 
-                          dampk4/63.0 + dampk5/945.0)*expk;
+      dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
+                                (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + 
+                                dampi6/(numtyp)1890.0)*expi - 
+        termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + 5.0*dampk4/(numtyp)126.0 + 
+                (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - 
+        (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + 
+                          dampi4/63.0 + dampi5/(numtyp)945.0)*expi - 
+        (numtyp)2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + 
+                          dampk4/63.0 + dampk5/(numtyp)945.0)*expk;
    }
  }
 }
--- a/src/AMOEBA/amoeba_polar.cpp
+++ b/src/AMOEBA/amoeba_polar.cpp
@ -366,6 +366,7 @@ void PairAmoeba::polar_real()
      yr = x[j][1] - yi;
      zr = x[j][2] - zi;
      r2 = xr*xr + yr*yr + zr*zr;
+
      if (r2 > off2) continue; 

      jtype = amtype[j];
@ -393,7 +394,7 @@ void PairAmoeba::polar_real()
 	  factor_uscale = 1.0;
 	}
      }
-
+      //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale);
      r = sqrt(r2);
      ck = rpole[j][0];
      dkx = rpole[j][1];
@ -579,7 +580,6 @@ void PairAmoeba::polar_real()
        tkz5 = 2.0 * (psr5*uiz+dsr5*uizp);
        tuir = -psr7*ukr - dsr7*ukrp;
        tukr = -psr7*uir - dsr7*uirp;
-      // reached here...
      } else if (hippo) {
        tix5 = 4.0 * (rr5i*ukx);
        tiy5 = 4.0 * (rr5i*uky);
@ -597,7 +597,6 @@ void PairAmoeba::polar_real()
      dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir;
      dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir;
      dufld[i][5] += zr*tiz5 + zr*zr*tuir;
-
      dufld[j][0] -= xr*tkx5 + xr*xr*tukr;
      dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr;
      dufld[j][2] -= yr*tky5 + yr*yr*tukr;
@ -855,6 +854,7 @@ void PairAmoeba::polar_real()
        frcx = -2.0 * depx;
        frcy = -2.0 * depy;
        frcz = -2.0 * depz;
+
      }

      // get the dtau/dr terms used for mutual polarization force
@ -1199,6 +1199,8 @@ void PairAmoeba::polar_real()

    torque2force(i,tep,fix,fiy,fiz,fpolar);

+    //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, tep[0], tep[1], tep[2]);
+
    iz = zaxis2local[i];
    ix = xaxis2local[i];
    iy = yaxis2local[i];
--- a/src/AMOEBA/pair_amoeba.cpp
+++ b/src/AMOEBA/pair_amoeba.cpp
@ -242,6 +242,47 @@ void PairAmoeba::compute(int eflag, int vflag)
    time_induce = time_polar = time_qxfer = 0.0;
  }

+  { // DEBUGGING
+  double **x = atom->x;
+  int inum,jnum;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  if (use_ewald) choose(MPOLE_LONG);
+  else choose(MPOLE);
+
+  int i,ii,j,jj;
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    double xi = x[i][0];
+    double yi = x[i][1];
+    double zi = x[i][2];
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      double factor_mpole = special_mpole[sbmask15(j)];
+      j &= NEIGHMASK15;
+
+      double xr = x[j][0] - xi;
+      double yr = x[j][1] - yi;
+      double zr = x[j][2] - zi;
+      double r2 = xr*xr + yr*yr + zr*zr;
+      if (r2 > off2) continue;
+      double r = sqrt(r2);
+      if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole);
+    }
+  }
+  
+
+  } // DEBUGGING
+
  double evdwl;

  evdwl = 0.0;
@ -973,8 +1014,8 @@ void PairAmoeba::init_style()

  int irequest = neighbor->request(this,instance_me);
  // for DEBUGGING with GPU
-  //neighbor->requests[irequest]->half = 0;
-  //neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;

  // open debug output files
  // names are hard-coded
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  gpu_multipole_real_ready = true;
  gpu_udirect2b_ready = false;
  gpu_umutual2b_ready = false;
-  gpu_polar_real_ready = true;
+  gpu_polar_real_ready = false;

  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }
@ -1137,6 +1137,8 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr,
    _tq[2] = tq_ptr[4*i+2];
    torque2force(i,_tq,fix,fiy,fiz,force_comp);

+    //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, _tq[0], _tq[1], _tq[2]);
+
    iz = zaxis2local[i];
    ix = xaxis2local[i];
    iy = yaxis2local[i];