Added the dispersion real space kernel and transfer special coeffs to the device

2021-09-19 23:40:43 -05:00
parent 1166845fcf
commit 0228867d8e
6 changed files with 153 additions and 53 deletions
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -47,6 +47,9 @@ template <class numtyp, class acctyp>
 int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
                  const double *host_pdamp, const double *host_thole,
                  const double *host_dirdamp, const int *host_amtype2class,
+                  const double *host_special_hal,
+                  const double *host_special_repel,
+                  const double *host_special_disp,
                  const double *host_special_mpole,
                  const double *host_special_polar_wscale,
                  const double *host_special_polar_piscale,
@ -109,12 +112,21 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
  }
  ucl_copy(sp_polar,dview,5,false);

+  sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<5; i++) {
+    dview[i].x=host_special_hal[i];
+    dview[i].y=host_special_repel[i];
+    dview[i].z=host_special_disp[i];
+    dview[i].w=(numtyp)0;
+  }
+  ucl_copy(sp_nonpolar,dview,5,false);
+
  _polar_dscale = polar_dscale;
  _polar_uscale = polar_uscale;

  _allocated=true;
  this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
-    + sp_polar.row_bytes() + this->_tep.row_bytes();
+    + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes();
  return 0;
 }

@ -125,7 +137,9 @@ void AmoebaT::clear() {
  _allocated=false;

  coeff_amtype.clear();
+  coeff_amclass.clear();
  sp_polar.clear();
+  sp_nonpolar.clear();
  
  this->clear_atomic();
 }
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -400,8 +400,9 @@ _texture( q_tex,int2);

 __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict extra,
-                                 const __global numtyp4 *restrict coeff,
-                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global numtyp4 *restrict coeff_amtype,
+                                 const __global numtyp4 *restrict coeff_amclass,
+                                 const __global numtyp4 *restrict sp_disp,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
@ -428,20 +429,11 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }

-  acctyp4 tq;
-  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
-
-  numtyp4* polar1 = (numtyp4*)(&extra[0]);
-  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);

  if (ii<inum) {
-    int m,itype,igroup;
-    numtyp bfac;
-    numtyp term1,term2,term3;
-    numtyp term4,term5,term6;
-    numtyp bn[6];
-    numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
+    int itype,iclass;
+    numtyp ci,ai;

    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
@ -460,18 +452,10 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
      nbor_mem = dev_short_nbor;
    }

-    ci  = polar1[i].x;    // rpole[i][0];
-    dix = polar1[i].y;    // rpole[i][1];
-    diy = polar1[i].z;    // rpole[i][2];
-    diz = polar1[i].w;    // rpole[i][3];
-    qixx = polar2[i].x;   // rpole[i][4];
-    qixy = polar2[i].y;   // rpole[i][5];
-    qixz = polar2[i].z;   // rpole[i][6];
-    qiyy = polar2[i].w;   // rpole[i][8];
-    qiyz   = polar3[i].x; // rpole[i][9];
-    qizz   = polar3[i].y; // rpole[i][12];
-    itype  = polar3[i].z; // amtype[i];
-    igroup = polar3[i].w; // amgroup[i];
+    itype  = polar3[i].z;            // amtype[i];
+    iclass = coeff_amtype[itype].w;  // amtype2class[itype];
+    ci = coeff_amclass[iclass].x;    // csix[iclass];
+    ai = coeff_amclass[iclass].y;    // adisp[iclass];

    for ( ; nbor<nbor_end; nbor+=n_stride) {

@ -482,34 +466,115 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
      //int jtype=jx.w;
 
      // Compute r12
-      numtyp xr = jx.x - ix.x;
-      numtyp yr = jx.y - ix.y;
-      numtyp zr = jx.z - ix.z;
+      numtyp xr = ix.x - jx.x;
+      numtyp yr = ix.y - jx.y;
+      numtyp zr = ix.z - jx.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
  
-      numtyp r = ucl_sqrt(r2);
-      numtyp ck = polar1[j].x;   // rpole[j][0];
-      numtyp dkx = polar1[j].y;  // rpole[j][1];
-      numtyp dky = polar1[j].z;  // rpole[j][2];
-      numtyp dkz = polar1[j].w;  // rpole[j][3];
-      numtyp qkxx = polar2[j].x; // rpole[j][4];
-      numtyp qkxy = polar2[j].y; // rpole[j][5];
-      numtyp qkxz = polar2[j].z; // rpole[j][6];
-      numtyp qkyy = polar2[j].w; // rpole[j][8];
-      numtyp qkyz = polar3[j].x; // rpole[j][9];
-      numtyp qkzz = polar3[j].y; // rpole[j][12];
      int jtype =   polar3[j].z; // amtype[j];
-      int jgroup =  polar3[j].w; // amgroup[j];
+      int jclass = coeff_amtype[jtype].w;  // amtype2class[jtype];
+      numtyp ck = coeff_amclass[jclass].x;    // csix[jclass];
+      numtyp ak = coeff_amclass[jclass].y;    // adisp[jclass];

+      numtyp r6 = r2*r2*r2;
+      numtyp ralpha2 = r2 * aewald*aewald;
+      numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2;
+      numtyp expterm = ucl_exp(-ralpha2);
+      numtyp expa = expterm * term;
+
+      // find the damping factor for the dispersion interaction
+
+      numtyp r = ucl_sqrt(r2);
+      numtyp r7 = r6 * r;
+      numtyp di = ai * r;
+      numtyp di2 = di * di;
+      numtyp di3 = di * di2;
+      numtyp dk = ak * r;
+      numtyp expi = ucl_exp(-di);
+      numtyp expk = ucl_exp(-dk);
+     
+      numtyp ai2,ak2;
+      numtyp di4,di5;
+      numtyp dk2,dk3;
+      numtyp ti,ti2;
+      numtyp tk,tk2;
+      numtyp damp3,damp5;
+      numtyp ddamp;
+      numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)];
+
+      if (ai != ak) {
+        ai2 = ai * ai;
+        ak2 = ak * ak;
+        dk2 = dk * dk;
+        dk3 = dk * dk2;
+        ti = ak2 / (ak2-ai2);
+        ti2 = ti * ti;
+        tk = ai2 / (ai2-ak2);
+        tk2 = tk * tk;
+        damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi
+          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk
+          - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi
+          - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk;
+        damp5 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi
+          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk
+          - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi
+          - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk;
+        ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + 
+          (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0);
+
+      } else {
+        di4 = di2 * di2;
+        di5 = di2 * di3;
+        damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi;
+        damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi;
+        ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0;
+      }
+
+      numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3;
+      
+      // apply damping and scaling factors for this interaction
+
+      numtyp scale = factor_disp * damp*damp;
+      scale = scale - (numtyp )1.0;
+      numtyp e = -ci * ck * (expa+scale) / r6;
+      numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r;
+      numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7;
+
+      energy+= e;
+
+      // increment the damped dispersion derivative components
+
+      numtyp dedx = de * xr;
+      numtyp dedy = de * yr;
+      numtyp dedz = de * zr;
+      f.x += dedx;
+      f.y += dedy;
+      f.z += dedz;
+      
+      // increment the internal virial tensor components
+
+      numtyp vxx = xr * dedx;
+      numtyp vyx = yr * dedx;
+      numtyp vzx = zr * dedx;
+      numtyp vyy = yr * dedy;
+      numtyp vzy = zr * dedy;
+      numtyp vzz = zr * dedz;
+
+      virial[0] += vxx;
+      virial[1] += vyy;
+      virial[2] += vzz;
+      virial[3] += vyx;
+      virial[4] += vzx;
+      virial[5] += vzy;
    } // nbor
    
  } // ii<inum

  // accumate force, energy and virial
-  //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
-  //   offset,eflag,vflag,ans,engv);
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+     offset,eflag,vflag,ans,engv);
 }

 /* ----------------------------------------------------------------------
@ -556,7 +621,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);

  if (ii<inum) {
-    int m,itype,igroup;
+    int m;
    numtyp bfac;
    numtyp term1,term2,term3;
    numtyp term4,term5,term6;
@ -590,8 +655,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
    qiyy = polar2[i].w;   // rpole[i][8];
    qiyz   = polar3[i].x; // rpole[i][9];
    qizz   = polar3[i].y; // rpole[i][12];
-    itype  = polar3[i].z; // amtype[i];
-    igroup = polar3[i].w; // amgroup[i];

    for ( ; nbor<nbor_end; nbor+=n_stride) {

@ -1391,9 +1454,8 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp ukyp = polar5[j].y; // uinp[j][1];
      numtyp ukzp = polar5[j].z; // uinp[j][2];

-      numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
+      numtyp factor_dscale, factor_pscale, factor_uscale;
      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
-      factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
      if (igroup == jgroup) {
        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
        factor_dscale = polar_dscale;
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@ -40,6 +40,8 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
  int init(const int ntypes, const int max_amtype, const int max_amclass,
           const double *host_pdamp, const double *host_thole, const double *host_dirdamp,
           const int *host_amtype2class, const double *host_special_mpole,
+           const double *host_special_hal, const double *host_special_repel,
+           const double *host_special_disp,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
@ -70,7 +72,13 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
  ///   sp_polar.x = special_polar_wscale
  ///   sp_polar.y special_polar_pscale,
  ///   sp_polar.z = special_polar_piscale
+  ///   sp_polar.w = special_mpole
  UCL_D_Vec<numtyp4> sp_polar;
+  /// Special nonpolar values [0-4]: 
+  ///   sp_nonpolar.x = special_hal
+  ///   sp_nonpolar.y special_repel
+  ///   sp_nonpolar.z = special_disp
+  UCL_D_Vec<numtyp4> sp_nonpolar;

  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -30,6 +30,9 @@ static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
 int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                    const double *host_pdamp, const double *host_thole,
                    const double *host_dirdamp, const int *host_amtype2class,
+                    const double *host_special_hal,
+                    const double *host_special_repel,
+                    const double *host_special_disp,
                    const double *host_special_mpole,
                    const double *host_special_polar_wscale,
                    const double *host_special_polar_piscale,
@ -66,7 +69,9 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas
  if (world_me==0)
    init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
                          host_pdamp, host_thole, host_dirdamp,
-                          host_amtype2class, host_special_mpole, host_special_polar_wscale,
+                          host_amtype2class, host_special_hal,
+                          host_special_repel, host_special_disp,
+                          host_special_mpole, host_special_polar_wscale,
                          host_special_polar_piscale, host_special_polar_pscale,
                          host_csix, host_adisp, nlocal, nall, max_nbors,
                          maxspecial, maxspecial15, cell_size, gpu_split,
@ -86,8 +91,11 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
-      init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp,
-                            host_amtype2class, host_special_mpole, host_special_polar_wscale,
+      init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
+                            host_pdamp, host_thole, host_dirdamp,
+                            host_amtype2class, host_special_hal,
+                            host_special_repel, host_special_disp,
+                            host_special_mpole, host_special_polar_wscale,
                            host_special_polar_piscale, host_special_polar_pscale,
                            host_csix, host_adisp, nlocal, nall, max_nbors,
                            maxspecial, maxspecial15, cell_size, gpu_split,
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -53,7 +53,8 @@ enum{GORDON1,GORDON2};
 int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                    const double *host_pdamp, const double *host_thole,
                    const double *host_dirdamp, const int* host_amtype2class,
-                    const double *host_special_mpole,
+                    const double *host_special_hal, const double *host_special_repel,
+                    const double *host_special_disp, const double *host_special_mpole,
                    const double *host_special_polar_wscale,
                    const double *host_special_polar_piscale,
                    const double *host_special_polar_pscale,
@ -116,6 +117,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  fieldp_pinned = nullptr;
  tq_pinned = nullptr;

+  gpu_hal_ready = false;
+  gpu_repulsion_ready = false;
+  gpu_dispersion_real_ready = false;
  gpu_multipole_real_ready = true;
  gpu_udirect2b_ready = true;
  gpu_umutual2b_ready = true;
@ -170,7 +174,8 @@ void PairAmoebaGPU::init_style()
  int tq_size;
  int mnf = 5e-2 * neighbor->oneatom;
  int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass,
-                                pdamp, thole, dirdamp, amtype2class, special_mpole,
+                                pdamp, thole, dirdamp, amtype2class, special_hal,
+                                special_repel, special_disp, special_mpole,
                                special_polar_wscale, special_polar_piscale,
                                special_polar_pscale, csix, adisp, atom->nlocal,
                                atom->nlocal+atom->nghost, mnf, maxspecial,
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@ -47,6 +47,9 @@ class PairAmoebaGPU : public PairAmoeba {
  void *fieldp_pinned;
  bool tq_single;

+  bool gpu_hal_ready;
+  bool gpu_repulsion_ready;
+  bool gpu_dispersion_real_ready;
  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
  bool gpu_umutual2b_ready;