Cleaned up debugging stuffs and unused variables

2021-09-17 23:13:51 -05:00
parent f5713a52b3
commit 78045d8f76
3 changed files with 89 additions and 101 deletions
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -62,7 +62,7 @@ _texture( q_tex,int2);
    tq.z=red_acc[2][tid];                                                   \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    tep[i]=tq;                                                               \
+    tep[i]=tq;                                                              \
  }

 #define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
@ -147,14 +147,14 @@ _texture( q_tex,int2);
    fieldp[ii+inum] = fp;                                                   \
  }

-#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom,       \
+#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom  \
                        offset, eflag, vflag, ans, engv, ev_stride)         \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
      if (eflag) {                                                          \
        simdsync();                                                         \
-        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul);         \
+        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
      }                                                                     \
      if (vflag) {                                                          \
        simdsync();                                                         \
@ -174,7 +174,7 @@ _texture( q_tex,int2);
    if (eflag!=2 && vflag!=2) {                                             \
      if (eflag) {                                                          \
        simdsync();                                                         \
-        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);               \
+        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);       \
        if (vflag) __syncthreads();                                         \
        if (tid==0) {                                                       \
          engv[ei]+=energy*(acctyp)0.5;                                     \
@ -225,7 +225,7 @@ _texture( q_tex,int2);
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
-    tep[i]=tq;                                                               \
+    tep[i]=tq;                                                              \
  }

 #define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
@ -280,25 +280,23 @@ _texture( q_tex,int2);

 #if (EVFLAG == 1)

-#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom,       \
+#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
                        offset, eflag, vflag, ans, engv, ev_stride)         \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
    if (vflag==2 || eflag==2) {                                             \
      if (eflag)                                                            \
-        simd_reduce_add2(t_per_atom,energy,e_coul);                                \
+        simd_reduce_add2(t_per_atom,energy,e_coul);                         \
      if (vflag)                                                            \
        simd_reduce_arr(6, t_per_atom,virial);                              \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    acctyp4 old=ans[ii];                                                    \
-    if (ii == 0) printf("old = %f %f %f\n", old.x, old.y, old.z);           \
    old.x+=f.x;                                                             \
    old.y+=f.y;                                                             \
    old.z+=f.z;                                                             \
    ans[ii]=old;                                                            \
-    if (ii == 0) printf("new = %f %f %f\n", old.x, old.y, old.z);           \
  }                                                                         \
  if (eflag || vflag) {                                                     \
    if (eflag!=2 && vflag!=2) {                                             \
@ -378,7 +376,7 @@ _texture( q_tex,int2);
 // EVFLAG == 0
 #else

-#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom,       \
+#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
                        offset, eflag, vflag, ans, engv, ev_stride)         \
  if (t_per_atom>1)                                                         \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
@ -402,20 +400,20 @@ _texture( q_tex,int2);
 ------------------------------------------------------------------------- */

 __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
-                                const __global numtyp *restrict extra,
-                                const __global numtyp4 *restrict damping,
-                                const __global numtyp4 *restrict sp_polar,
-                                const __global int *dev_nbor,
-                                const __global int *dev_packed,
-                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict ans,
-                                __global acctyp *restrict engv,
-                                __global numtyp4 *restrict tep,
-                                const int eflag, const int vflag, const int inum,
-                                const int nall, const int nbor_pitch, const int t_per_atom,
-                                const numtyp aewald, const numtyp felec,
-                                const numtyp off2, const numtyp polar_dscale,
-                                const numtyp polar_uscale)
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv,
+                                 __global numtyp4 *restrict tep,
+                                 const int eflag, const int vflag, const int inum,
+                                 const int nall, const int nbor_pitch,
+                                 const int t_per_atom, const numtyp aewald,
+                                 const numtyp felec, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
 {
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);
@ -439,14 +437,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);

-  //numtyp4 xi__;
-
  if (ii<inum) {
-    int k,m,itype,igroup;
+    int m,itype,igroup;
    numtyp bfac;
    numtyp term1,term2,term3;
-    numtyp term4,term5;
-    numtyp term6;
+    numtyp term4,term5,term6;
    numtyp bn[6];
    numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;

@ -480,9 +475,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
    itype  = polar3[i].z; // amtype[i];
    igroup = polar3[i].w; // amgroup[i];

-    // debug:
-    // xi__ = ix; xi__.w = itype;
-
    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=nbor_mem[nbor];
@ -497,7 +489,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp zr = jx.z - ix.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

-      if (r2>off2) continue;
+      //if (r2>off2) continue;
  
      numtyp r = ucl_sqrt(r2);
      numtyp ck = polar1[j].x;   // rpole[j][0];
@ -613,14 +605,14 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      for (m = 1; m < 6; m++) {
        bfac = (numtyp) (m+m-1);
        alsq2n = alsq2 * alsq2n;
-        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
+        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
      }
      for (m = 0; m < 6; m++) bn[m] *= felec;

      term1 = ci*ck;
      term2 = ck*dir - ci*dkr + dik;
-      term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk);
-      term4 = dir*qkr - dkr*qir - 4.0*qik;
+      term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
+      term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik;
      term5 = qir*qkr;
      numtyp scalek = (numtyp)1.0 - factor_mpole;
      rr1 = bn[0] - scalek*rr1;
@ -730,8 +722,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);

-  //numtyp4 xi__;
-
  if (ii<inum) {
    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
@ -1337,7 +1327,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      for (m = 1; m <= 4; m++) {
        bfac = (numtyp) (m+m-1);
        alsq2n = alsq2 * alsq2n;
-        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
+        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
      }
      for (m = 0; m < 5; m++) bn[m] *= felec;

--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -60,7 +60,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
                    const int nlocal, const int nall, const int max_nbors,
                    const int maxspecial, const int maxspecial15,
                    const double cell_size, int &gpu_mode, FILE *screen,
-                    const double polar_dscale, const double polar_uscale, int& tep_size);
+                    const double polar_dscale, const double polar_uscale, int& tq_size);
 void amoeba_gpu_clear();

 int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
@ -70,7 +70,7 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
              int &host_start, int **ilist, int **jnum, const double cpu_time,
              bool &success, const double aewald, const double felec, const double off2,
-              double *host_q, double *boxlo, double *prd, void **tep_ptr);
+              double *host_q, double *boxlo, double *prd, void **tq_ptr);

 int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
@ -100,7 +100,7 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
              int &host_start, int **ilist, int **jnum, const double cpu_time,
              bool &success, const double aewald, const double felec, const double off2,
-              double *host_q, double *boxlo, double *prd, void **tep_ptr);
+              double *host_q, double *boxlo, double *prd, void **tq_ptr);

 double amoeba_gpu_bytes();

@ -113,7 +113,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  cpu_time = 0.0;
  suffix_flag |= Suffix::GPU;
  fieldp_pinned = nullptr;
-  tep_pinned = nullptr;
+  tq_pinned = nullptr;

  gpu_multipole_real_ready = true;
  gpu_udirect2b_ready = true;
@ -166,23 +166,23 @@ void PairAmoebaGPU::init_style()
    maxspecial15=atom->maxspecial15;
  }
    
-  int tep_size;
+  int tq_size;
  int mnf = 5e-2 * neighbor->oneatom;
  int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp,
                                special_mpole, special_polar_wscale, special_polar_piscale,
                                special_polar_pscale, atom->nlocal,
                                atom->nlocal+atom->nghost, mnf, maxspecial,
                                maxspecial15, cell_size, gpu_mode, screen,
-                                polar_dscale, polar_uscale, tep_size);
+                                polar_dscale, polar_uscale, tq_size);
  GPU_EXTRA::check_flag(success,error,world);

  if (gpu_mode == GPU_FORCE)
    error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now");

-  if (tep_size == sizeof(double))
-    tep_single = false;
+  if (tq_size == sizeof(double))
+    tq_single = false;
  else
-    tep_single = true;
+    tq_single = true;
 }

 /* ---------------------------------------------------------------------- */
@ -231,19 +231,19 @@ void PairAmoebaGPU::multipole_real()
                                                 eflag, vflag, eflag_atom, vflag_atom,
                                                 host_start, &ilist, &numneigh, cpu_time,
                                                 success, aewald, felec, off2, atom->q,
-                                                 domain->boxlo, domain->prd, &tep_pinned);
+                                                 domain->boxlo, domain->prd, &tq_pinned);
  
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

  // reference to the tep array from GPU lib
-  printf("compute multipole real\n");
-  if (tep_single) {
-    float *tep_ptr = (float *)tep_pinned;
-    compute_force_from_tep<float>(tep_ptr, fmpole, virmpole);
+
+  if (tq_single) {
+    float *tq_ptr = (float *)tq_pinned;
+    compute_force_from_torque<float>(tq_ptr, fmpole, virmpole);
  } else {
-    double *tep_ptr = (double *)tep_pinned;
-    compute_force_from_tep<double>(tep_ptr, fmpole, virmpole);
+    double *tq_ptr = (double *)tq_pinned;
+    compute_force_from_torque<double>(tq_ptr, fmpole, virmpole);
  }
 }

@ -681,7 +681,6 @@ void PairAmoebaGPU::induce()
  }
 }

-
 /* ----------------------------------------------------------------------
   udirect2b = Ewald real direct field via list
   udirect2b computes the real space contribution of the permanent
@ -721,19 +720,20 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
  else choose(POLAR);

  firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x,
-                                        atom->type, amtype, amgroup, rpole, uind, uinp,
-                                        sublo, subhi, atom->tag, atom->nspecial, atom->special,
-                                        atom->nspecial15, atom->special15,
-                                        eflag, vflag, eflag_atom, vflag_atom,
-                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success, aewald, off2, atom->q, domain->boxlo,
-                                        domain->prd, &fieldp_pinned);
+                                            atom->type, amtype, amgroup, rpole,
+                                            uind, uinp, sublo, subhi, atom->tag,
+                                            atom->nspecial, atom->special,
+                                            atom->nspecial15, atom->special15,
+                                            eflag, vflag, eflag_atom, vflag_atom,
+                                            host_start, &ilist, &numneigh, cpu_time,
+                                            success, aewald, off2, atom->q,
+                                            domain->boxlo, domain->prd, &fieldp_pinned);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

  // rebuild dipole-dipole pair list and store pairwise dipole matrices
  // done one atom at a time in real-space double loop over atoms & neighs
-
+  // NOTE: for the moment the tdipdip values are computed just in time in umutual2b()
  // udirect2b_cpu();

  // accumulate the field and fieldp values from the GPU lib
@ -945,13 +945,14 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
  else choose(POLAR);

  firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x,
-                                        atom->type, amtype, amgroup, rpole, uind, uinp,
-                                        sublo, subhi, atom->tag, atom->nspecial, atom->special,
-                                        atom->nspecial15, atom->special15,
-                                        eflag, vflag, eflag_atom, vflag_atom,
-                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success,aewald, off2, atom->q, domain->boxlo,
-                                        domain->prd, &fieldp_pinned);
+                                            atom->type, amtype, amgroup, rpole,
+                                            uind, uinp, sublo, subhi, atom->tag,
+                                            atom->nspecial, atom->special,
+                                            atom->nspecial15, atom->special15,
+                                            eflag, vflag, eflag_atom, vflag_atom,
+                                            host_start, &ilist, &numneigh, cpu_time,
+                                            success,aewald, off2, atom->q,
+                                            domain->boxlo, domain->prd, &fieldp_pinned);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

@ -1017,37 +1018,37 @@ void PairAmoebaGPU::polar_real()
  double felec = 0.5 * electric / am_dielectric;

  firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
-                                        atom->type, amtype, amgroup,
-                                        rpole, uind, uinp, sublo, subhi,
-                                        atom->tag, atom->nspecial, atom->special,
-                                        atom->nspecial15, atom->special15,
-                                        eflag, vflag, eflag_atom, vflag_atom,
-                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success, aewald, felec, off2, atom->q, domain->boxlo,
-                                        domain->prd, &tep_pinned);
+                                             atom->type, amtype, amgroup,
+                                             rpole, uind, uinp, sublo, subhi,
+                                             atom->tag, atom->nspecial, atom->special,
+                                             atom->nspecial15, atom->special15,
+                                             eflag, vflag, eflag_atom, vflag_atom,
+                                             host_start, &ilist, &numneigh, cpu_time,
+                                             success, aewald, felec, off2, atom->q,
+                                             domain->boxlo, domain->prd, &tq_pinned);
  
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

  // reference to the tep array from GPU lib
-  printf("compute polar real\n");
-  if (tep_single) {
-    float *tep_ptr = (float *)tep_pinned;
-    compute_force_from_tep<float>(tep_ptr, fpolar, virpolar);
+
+  if (tq_single) {
+    float *tep_ptr = (float *)tq_pinned;
+    compute_force_from_torque<float>(tep_ptr, fpolar, virpolar);
  } else {
-    double *tep_ptr = (double *)tep_pinned;
-    compute_force_from_tep<double>(tep_ptr, fpolar, virpolar);
+    double *tep_ptr = (double *)tq_pinned;
+    compute_force_from_torque<double>(tep_ptr, fpolar, virpolar);
  }
 }

 /* ----------------------------------------------------------------------
-   init specific to this pair style
+   compute atom forces from torques
 ------------------------------------------------------------------------- */

 template <class numtyp>
-void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
-                                           double** force_comp,
-                                           double* virial_comp)
+void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr,
+                                              double** force_comp,
+                                              double* virial_comp)
 {
  int i,ix,iy,iz;
  double xix,yix,zix;
@ -1055,19 +1056,16 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
  double xiz,yiz,ziz;
  double vxx,vyy,vzz;
  double vxy,vxz,vyz;
-  double fix[3],fiy[3],fiz[3],tep[4];
+  double fix[3],fiy[3],fiz[3],_tq[4];

  double** x = atom->x;
  int nlocal = atom->nlocal;

  for (i = 0; i < nlocal; i++) {
-    tep[0] = tep_ptr[4*i];
-    tep[1] = tep_ptr[4*i+1];
-    tep[2] = tep_ptr[4*i+2];
-
-    if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]);
-    torque2force(i,tep,fix,fiy,fiz,force_comp);
-    if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]);
+    _tq[0] = tq_ptr[4*i];
+    _tq[1] = tq_ptr[4*i+1];
+    _tq[2] = tq_ptr[4*i+2];
+    torque2force(i,_tq,fix,fiy,fiz,force_comp);

    iz = zaxis2local[i];
    ix = xaxis2local[i];
@ -1092,7 +1090,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
                 xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + 
                 yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
-    //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
+
    virial_comp[0] += vxx;
    virial_comp[1] += vyy;
    virial_comp[2] += vzz;
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@ -43,9 +43,9 @@ class PairAmoebaGPU : public PairAmoeba {
 private:
  int gpu_mode;
  double cpu_time;
-  void *tep_pinned;
+  void *tq_pinned;
  void *fieldp_pinned;
-  bool tep_single;
+  bool tq_single;

  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
@ -55,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba {
  void udirect2b_cpu();

  template<class numtyp>
-  void compute_force_from_tep(const numtyp*, double**, double*);
+  void compute_force_from_torque(const numtyp*, double**, double*);
 };

 }    // namespace LAMMPS_NS