Added the dispersion real space kernel and transfer special coeffs to the device

2021-09-19 23:40:43 -05:00
parent 1166845fcf
commit 0228867d8e
6 changed files with 153 additions and 53 deletions
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -47,6 +47,9 @@ template <class numtyp, class acctyp>
 int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
                  const double *host_pdamp, const double *host_thole,
                  const double *host_dirdamp, const int *host_amtype2class,
                  const double *host_special_hal,
                  const double *host_special_repel,
                  const double *host_special_disp,
                  const double *host_special_mpole,
                  const double *host_special_polar_wscale,
                  const double *host_special_polar_piscale,
@ -109,12 +112,21 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
  }
  ucl_copy(sp_polar,dview,5,false);
  sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<5; i++) {
    dview[i].x=host_special_hal[i];
    dview[i].y=host_special_repel[i];
    dview[i].z=host_special_disp[i];
    dview[i].w=(numtyp)0;
  }
  ucl_copy(sp_nonpolar,dview,5,false);
  _polar_dscale = polar_dscale;
  _polar_uscale = polar_uscale;
  _allocated=true;
  this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
-    + sp_polar.row_bytes() + this->_tep.row_bytes();
+    + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes();
  return 0;
 }
@ -125,7 +137,9 @@ void AmoebaT::clear() {
  _allocated=false;
  coeff_amtype.clear();
  coeff_amclass.clear();
  sp_polar.clear();
  sp_nonpolar.clear();
  this->clear_atomic();
 }
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -400,8 +400,9 @@ _texture( q_tex,int2);
 __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
                                 const __global numtyp *restrict extra,
-                                 const __global numtyp4 *restrict coeff,
+                                 const __global numtyp4 *restrict coeff_amtype,
-                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global numtyp4 *restrict coeff_amclass,
                                 const __global numtyp4 *restrict sp_disp,
                                 const __global int *dev_nbor,
                                 const __global int *dev_packed,
                                 const __global int *dev_short_nbor,
@ -428,20 +429,11 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }
  acctyp4 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
  numtyp4* polar1 = (numtyp4*)(&extra[0]);
  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
  if (ii<inum) {
-    int m,itype,igroup;
+    int itype,iclass;
-    numtyp bfac;
+    numtyp ci,ai;
    numtyp term1,term2,term3;
    numtyp term4,term5,term6;
    numtyp bn[6];
    numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
@ -460,18 +452,10 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
      nbor_mem = dev_short_nbor;
    }
    ci  = polar1[i].x;    // rpole[i][0];
    dix = polar1[i].y;    // rpole[i][1];
    diy = polar1[i].z;    // rpole[i][2];
    diz = polar1[i].w;    // rpole[i][3];
    qixx = polar2[i].x;   // rpole[i][4];
    qixy = polar2[i].y;   // rpole[i][5];
    qixz = polar2[i].z;   // rpole[i][6];
    qiyy = polar2[i].w;   // rpole[i][8];
    qiyz   = polar3[i].x; // rpole[i][9];
    qizz   = polar3[i].y; // rpole[i][12];
    itype  = polar3[i].z;            // amtype[i];
-    igroup = polar3[i].w; // amgroup[i];
+    iclass = coeff_amtype[itype].w;  // amtype2class[itype];
    ci = coeff_amclass[iclass].x;    // csix[iclass];
    ai = coeff_amclass[iclass].y;    // adisp[iclass];
    for ( ; nbor<nbor_end; nbor+=n_stride) {
@ -482,34 +466,115 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
      //int jtype=jx.w;
      // Compute r12
-      numtyp xr = jx.x - ix.x;
+      numtyp xr = ix.x - jx.x;
-      numtyp yr = jx.y - ix.y;
+      numtyp yr = ix.y - jx.y;
-      numtyp zr = jx.z - ix.z;
+      numtyp zr = ix.z - jx.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;
      //if (r2>off2) continue;
      numtyp r = ucl_sqrt(r2);
      numtyp ck = polar1[j].x;   // rpole[j][0];
      numtyp dkx = polar1[j].y;  // rpole[j][1];
      numtyp dky = polar1[j].z;  // rpole[j][2];
      numtyp dkz = polar1[j].w;  // rpole[j][3];
      numtyp qkxx = polar2[j].x; // rpole[j][4];
      numtyp qkxy = polar2[j].y; // rpole[j][5];
      numtyp qkxz = polar2[j].z; // rpole[j][6];
      numtyp qkyy = polar2[j].w; // rpole[j][8];
      numtyp qkyz = polar3[j].x; // rpole[j][9];
      numtyp qkzz = polar3[j].y; // rpole[j][12];
      int jtype =   polar3[j].z; // amtype[j];
-      int jgroup =  polar3[j].w; // amgroup[j];
+      int jclass = coeff_amtype[jtype].w;  // amtype2class[jtype];
      numtyp ck = coeff_amclass[jclass].x;    // csix[jclass];
      numtyp ak = coeff_amclass[jclass].y;    // adisp[jclass];
      numtyp r6 = r2*r2*r2;
      numtyp ralpha2 = r2 * aewald*aewald;
      numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2;
      numtyp expterm = ucl_exp(-ralpha2);
      numtyp expa = expterm * term;
      // find the damping factor for the dispersion interaction
      numtyp r = ucl_sqrt(r2);
      numtyp r7 = r6 * r;
      numtyp di = ai * r;
      numtyp di2 = di * di;
      numtyp di3 = di * di2;
      numtyp dk = ak * r;
      numtyp expi = ucl_exp(-di);
      numtyp expk = ucl_exp(-dk);
      numtyp ai2,ak2;
      numtyp di4,di5;
      numtyp dk2,dk3;
      numtyp ti,ti2;
      numtyp tk,tk2;
      numtyp damp3,damp5;
      numtyp ddamp;
      numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)];
      if (ai != ak) {
        ai2 = ai * ai;
        ak2 = ak * ak;
        dk2 = dk * dk;
        dk3 = dk * dk2;
        ti = ak2 / (ak2-ai2);
        ti2 = ti * ti;
        tk = ai2 / (ai2-ak2);
        tk2 = tk * tk;
        damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi
          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk
          - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi
          - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk;
        damp5 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi
          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk
          - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi
          - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk;
        ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + 
          (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0);
      } else {
        di4 = di2 * di2;
        di5 = di2 * di3;
        damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi;
        damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi;
        ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0;
      }
      numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3;
      // apply damping and scaling factors for this interaction
      numtyp scale = factor_disp * damp*damp;
      scale = scale - (numtyp )1.0;
      numtyp e = -ci * ck * (expa+scale) / r6;
      numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r;
      numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7;
      energy+= e;
      // increment the damped dispersion derivative components
      numtyp dedx = de * xr;
      numtyp dedy = de * yr;
      numtyp dedz = de * zr;
      f.x += dedx;
      f.y += dedy;
      f.z += dedz;
      // increment the internal virial tensor components
      numtyp vxx = xr * dedx;
      numtyp vyx = yr * dedx;
      numtyp vzx = zr * dedx;
      numtyp vyy = yr * dedy;
      numtyp vzy = zr * dedy;
      numtyp vzz = zr * dedz;
      virial[0] += vxx;
      virial[1] += vyy;
      virial[2] += vzz;
      virial[3] += vyx;
      virial[4] += vzx;
      virial[5] += vzy;
    } // nbor
  } // ii<inum
  // accumulate force, energy and virial
-  //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
-  //   offset,eflag,vflag,ans,engv);
+     offset,eflag,vflag,ans,engv);
 }
 /* ----------------------------------------------------------------------
@ -556,7 +621,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
  if (ii<inum) {
-    int m,itype,igroup;
+    int m;
    numtyp bfac;
    numtyp term1,term2,term3;
    numtyp term4,term5,term6;
@ -590,8 +655,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
    qiyy = polar2[i].w;   // rpole[i][8];
    qiyz   = polar3[i].x; // rpole[i][9];
    qizz   = polar3[i].y; // rpole[i][12];
    itype  = polar3[i].z; // amtype[i];
    igroup = polar3[i].w; // amgroup[i];
    for ( ; nbor<nbor_end; nbor+=n_stride) {
@ -1391,9 +1454,8 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp ukyp = polar5[j].y; // uinp[j][1];
      numtyp ukzp = polar5[j].z; // uinp[j][2];
-      numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
+      numtyp factor_dscale, factor_pscale, factor_uscale;
      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
      factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
      if (igroup == jgroup) {
        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
        factor_dscale = polar_dscale;
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@ -40,6 +40,8 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
  int init(const int ntypes, const int max_amtype, const int max_amclass,
           const double *host_pdamp, const double *host_thole, const double *host_dirdamp,
           const int *host_amtype2class, const double *host_special_mpole,
           const double *host_special_hal, const double *host_special_repel,
           const double *host_special_disp,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
@ -70,7 +72,13 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
  ///   sp_polar.x = special_polar_wscale
  ///   sp_polar.y = special_polar_pscale
  ///   sp_polar.z = special_polar_piscale
  ///   sp_polar.w = special_mpole
  UCL_D_Vec<numtyp4> sp_polar;
  /// Special nonpolar values [0-4]: 
  ///   sp_nonpolar.x = special_hal
  ///   sp_nonpolar.y = special_repel
  ///   sp_nonpolar.z = special_disp
  UCL_D_Vec<numtyp4> sp_nonpolar;
  /// If atom type constants fit in shared memory, use fast kernels
  bool shared_types;
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -30,6 +30,9 @@ static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
 int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                    const double *host_pdamp, const double *host_thole,
                    const double *host_dirdamp, const int *host_amtype2class,
                    const double *host_special_hal,
                    const double *host_special_repel,
                    const double *host_special_disp,
                    const double *host_special_mpole,
                    const double *host_special_polar_wscale,
                    const double *host_special_polar_piscale,
@ -66,7 +69,9 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas
  if (world_me==0)
    init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
                          host_pdamp, host_thole, host_dirdamp,
-                          host_amtype2class, host_special_mpole, host_special_polar_wscale,
+                          host_amtype2class, host_special_hal,
                          host_special_repel, host_special_disp,
                          host_special_mpole, host_special_polar_wscale,
                          host_special_polar_piscale, host_special_polar_pscale,
                          host_csix, host_adisp, nlocal, nall, max_nbors,
                          maxspecial, maxspecial15, cell_size, gpu_split,
@ -86,8 +91,11 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas
      fflush(screen);
    }
    if (gpu_rank==i && world_me!=0)
-      init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp,
+      init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
-                            host_amtype2class, host_special_mpole, host_special_polar_wscale,
+                            host_pdamp, host_thole, host_dirdamp,
                            host_amtype2class, host_special_hal,
                            host_special_repel, host_special_disp,
                            host_special_mpole, host_special_polar_wscale,
                            host_special_polar_piscale, host_special_polar_pscale,
                            host_csix, host_adisp, nlocal, nall, max_nbors,
                            maxspecial, maxspecial15, cell_size, gpu_split,
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -53,7 +53,8 @@ enum{GORDON1,GORDON2};
 int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                    const double *host_pdamp, const double *host_thole,
                    const double *host_dirdamp, const int* host_amtype2class,
-                    const double *host_special_mpole,
+                    const double *host_special_hal, const double *host_special_repel,
                    const double *host_special_disp, const double *host_special_mpole,
                    const double *host_special_polar_wscale,
                    const double *host_special_polar_piscale,
                    const double *host_special_polar_pscale,
@ -116,6 +117,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  fieldp_pinned = nullptr;
  tq_pinned = nullptr;
  gpu_hal_ready = false;
  gpu_repulsion_ready = false;
  gpu_dispersion_real_ready = false;
  gpu_multipole_real_ready = true;
  gpu_udirect2b_ready = true;
  gpu_umutual2b_ready = true;
@ -170,7 +174,8 @@ void PairAmoebaGPU::init_style()
  int tq_size;
  int mnf = 5e-2 * neighbor->oneatom;
  int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass,
-                                pdamp, thole, dirdamp, amtype2class, special_mpole,
+                                pdamp, thole, dirdamp, amtype2class, special_hal,
                                special_repel, special_disp, special_mpole,
                                special_polar_wscale, special_polar_piscale,
                                special_polar_pscale, csix, adisp, atom->nlocal,
                                atom->nlocal+atom->nghost, mnf, maxspecial,
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@ -47,6 +47,9 @@ class PairAmoebaGPU : public PairAmoeba {
  void *fieldp_pinned;
  bool tq_single;
  bool gpu_hal_ready;
  bool gpu_repulsion_ready;
  bool gpu_dispersion_real_ready;
  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
  bool gpu_umutual2b_ready;