diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 67f0877e1a..a3bd653efd 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -45,7 +45,8 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const { template int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_special_polar_wscale, + const double *host_thole, const double *host_dirdamp, + const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const int nlocal, const int nall, const int max_nbors, @@ -76,7 +77,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda for (int i = 0; i < max_amtype; i++) { host_write[i].x = host_pdamp[i]; host_write[i].y = host_thole[i]; - host_write[i].z = (numtyp)0; + host_write[i].z = host_dirdamp[i]; host_write[i].w = (numtyp)0; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fbda1e0787..1f5fb42438 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -91,6 +91,37 @@ _texture( q_tex,int2); tep[i]=t; \ } +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ + i, field, fieldp) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + + } // nbor + + } // ii { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_special_polar_wscale, + const double *host_thole, const double *host_dirdamp, + const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const int nlocal, const int nall, const int max_nbors, diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 27c35a810f..a7959ed93e 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -29,6 +29,7 @@ static Amoeba AMOEBAMF; // --------------------------------------------------------------------------- int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -62,7 +63,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, int init_ok=0; if (world_me==0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, + init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, @@ -82,7 +83,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, + init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen,