From c765861851c3464c9d1c93e90bba8c4e75b28aa0 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 11 Sep 2021 01:00:58 -0500 Subject: [PATCH] Cleaned up and re-arranged the functions to reflect the order of calling in a time step --- lib/gpu/lal_amoeba.cpp | 105 +++-- lib/gpu/lal_amoeba.cu | 890 +++++++++++++++++------------------- lib/gpu/lal_base_amoeba.cpp | 28 +- 3 files changed, 493 insertions(+), 530 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 3a83f57594..6f1e0cfaa9 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -126,49 +126,6 @@ double AmoebaT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Amoeba); } -// --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep -// --------------------------------------------------------------------------- -template -int AmoebaT::polar_real(const int eflag, const int vflag) { - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - this->time_pair.start(); - - // Build the short neighbor list if needed - if (!this->short_nbor_avail) { - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, - &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; - } - - this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, - &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); - this->time_pair.stop(); - - // Signal that short nbor list is not avail for the next time step - // do it here because polar_real() is the last kernel in a time step at this point - - this->short_nbor_avail = false; - - return GX; -} - // --------------------------------------------------------------------------- // Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- @@ -182,13 +139,13 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); - // Build the short neighbor list if needed + // Build the short neighbor list if not done yet if (!this->short_nbor_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); this->short_nbor_avail = true; } @@ -219,9 +176,20 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); + // Build the short neighbor list if not done yet + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + this->k_umutual2b.set_size(GX,BX); this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_off2, &_polar_dscale, &_polar_uscale); @@ -230,5 +198,48 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Calculate the polar real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::polar_real(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int _nall=this->atom->nall(); + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, + &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_avail = false; + + return GX; +} + template class Amoeba; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index bcb3aef309..fb515c69f7 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -185,6 +185,421 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp _fieldp[6]; + for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; + + numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + //numtyp4 xi__; + + if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // iioff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_uscale; + + // find terms needed later to compute mutual polarization + // if (poltyp != DIRECT) + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_uscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + numtyp tdipdip[6]; // the following tdipdip is incorrect!! needs work to store tdipdip + tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[1] = bcn[1]*xr*yr; + tdipdip[2] = bcn[1]*xr*zr; + tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[4] = bcn[1]*yr*zr; + tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; + + fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; + fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; + fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; + fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; + fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; koff2) continue; + //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); @@ -707,474 +1122,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv); } -/* ---------------------------------------------------------------------- - udirect2b = Ewald real direct field via list - udirect2b computes the real space contribution of the permanent - atomic multipole moments to the field via a neighbor list -------------------------------------------------------------------------- */ - -__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global numtyp4 *restrict fieldp, - const int inum, const int nall, - const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) -{ - int tid, ii, offset, i; - atom_info(t_per_atom,ii,tid,offset); - - int n_stride; - local_allocate_store_charge(); - - acctyp _fieldp[6]; - for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - - numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - - //numtyp4 xi__; - - if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); - - for ( ; nboroff2) { - if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj); - continue; - } - numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); - numtyp r2inv = rinv*rinv; - numtyp rr1 = rinv; - numtyp rr3 = rr1 * r2inv; - numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; - numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; - - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - - numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; - factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; - if (igroup == jgroup) { - factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; - factor_dscale = polar_dscale; - factor_uscale = polar_uscale; - } else { - factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; - factor_dscale = factor_uscale = (numtyp)1.0; - } - - // intermediates involving moments and separation distance - - numtyp dir = dix*xr + diy*yr + diz*zr; - numtyp qix = qixx*xr + qixy*yr + qixz*zr; - numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; - numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; - numtyp qir = qix*xr + qiy*yr + qiz*zr; - numtyp dkr = dkx*xr + dky*yr + dkz*zr; - numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; - numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; - numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; - numtyp qkr = qkx*xr + qky*yr + qkz*zr; - - // calculate the real space Ewald error function terms - - numtyp ralpha = aewald * r; - numtyp exp2a = ucl_exp(-ralpha*ralpha); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; - bn[0] = _erfc * rinv; - - numtyp aefac = aesq2n; - for (int m = 1; m <= 3; m++) { - numtyp bfac = (numtyp) (m+m-1); - aefac = aesq2 * aefac; - bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; - } - - // find the field components for Thole polarization damping - - numtyp scale3 = (numtyp)1.0; - numtyp scale5 = (numtyp)1.0; - numtyp scale7 = (numtyp)1.0; - numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] - if (damp != (numtyp)0.0) { - numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] - if (pgamma != (numtyp)0.0) { - damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp) ; - scale3 = (numtyp)1.0 - expdamp ; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); - scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); - } - } else { - pgamma = MIN(pti,damping[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,3.0); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp); - scale3 = (numtyp)1.0 - expdamp; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); - scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); - } - } - } else { // damp == 0: ??? - } - - numtyp scalek = factor_dscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; - fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; - fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; - fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - - scalek = factor_pscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; - fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; - fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; - fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - - _fieldp[0] += fid[0]; - _fieldp[1] += fid[1]; - _fieldp[2] += fid[2]; - _fieldp[3] += fip[0]; - _fieldp[4] += fip[1]; - _fieldp[5] += fip[2]; - } // nbor - - } // iioff2) continue; - - numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); - numtyp r2inv = rinv*rinv; - numtyp rr1 = rinv; - numtyp rr3 = rr1 * r2inv; - numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; - - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - numtyp ukx = polar4[j].x; // uind[j][0]; - numtyp uky = polar4[j].y; // uind[j][1]; - numtyp ukz = polar4[j].z; // uind[j][2]; - numtyp ukxp = polar5[j].x; // uinp[j][0]; - numtyp ukyp = polar5[j].y; // uinp[j][1]; - numtyp ukzp = polar5[j].z; // uinp[j][2]; - - numtyp factor_uscale; - - // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) - numtyp scale3 = (numtyp)1.0; - numtyp scale5 = (numtyp)1.0; - numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] - if (damp != (numtyp)0.0) { - numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] - if (pgamma != (numtyp)0.0) { - damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp); - scale3 = (numtyp)1.0 - expdamp; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); - } - } - } else { // damp == 0: ??? - } - - numtyp scalek = factor_uscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - numtyp tdipdip[6]; // the following tdipdip is incorrect!! needs work to store tdipdip - tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; - tdipdip[1] = bcn[1]*xr*yr; - tdipdip[2] = bcn[1]*xr*zr; - tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; - tdipdip[4] = bcn[1]*yr*zr; - tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; - - fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; - fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; - fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - - fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; - fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; - fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - - _fieldp[0] += fid[0]; - _fieldp[1] += fid[1]; - _fieldp[2] += fid[2]; - _fieldp[3] += fip[0]; - _fieldp[4] += fip[1]; - _fieldp[5] += fip[2]; - } // nbor - - } // ii> SBBITS & 3; - int j = sj & NEIGHMASK; - tagint jtag = tag[j]; - - if (!which) { - int offset=ii; - for (int k=0; k1) { - for (unsigned int s=t_per_atom/2; s>0; s>>=1) - new_numj += shfl_down(new_numj, s, t_per_atom); - } - if (offset==0 && iidev_nbor); } - bool allocate_packed = false; + bool alloc_packed=false; success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, - _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom); + _gpu_host,max_nbors,cell_size,alloc_packed,_threads_per_atom); if (success!=0) return success; @@ -231,8 +231,6 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, add_onefive_neighbors(); } - //nbor->copy_unpacked(inum,mn); - double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -336,17 +334,17 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f template int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { acc_timers(); int eflag, vflag; if (eatom) eflag=2;