diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 40da00f176..997e7b21ed 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -143,7 +143,7 @@ class BaseAmoeba { double *charge, double *boxlo, double *prd); /// Compute multipole real-space with device neighboring - int** compute_multipole_real(const int ago, const int inum_full, const int nall, + virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, @@ -155,7 +155,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring - int** compute_udirect2b(const int ago, const int inum_full, const int nall, + virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -169,7 +169,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring - int** compute_umutual2b(const int ago, const int inum_full, const int nall, + virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -183,7 +183,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring - int** compute_polar_real(const int ago, const int inum_full, const int nall, + virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 07f8732bcb..fad749a185 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -56,6 +56,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -69,7 +70,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, if (success!=0) return success; + // specific to HIPPO k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); + _pval.alloc(this->_max_tep_size,*(this->ucl_device),UCL_READ_ONLY,UCL_READ_ONLY); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; @@ -98,8 +101,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, for (int i = 0; i < max_amclass; i++) { host_write2[i].x = host_csix[i]; host_write2[i].y = host_adisp[i]; - host_write2[i].z = (numtyp)0; - host_write2[i].w = (numtyp)0; + host_write2[i].z = host_pcore[i]; + host_write2[i].w = host_palpha[i]; } coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); @@ -262,6 +265,93 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute multipole real-space +// --------------------------------------------------------------------------- +template +int** HippoT::compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_mpole, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. + + int** firstneigh = nullptr; + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_mpole = off2_mpole; + this->_felec = felec; + this->_aewald = aewald; + const int red_blocks=multipole_real(eflag,vflag); + + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + this->hd_balancer.stop_timer(); + + // copy tep from device to host + + this->_tep.update_host(this->_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Calculate the multipole real-space term, returning tep // --------------------------------------------------------------------------- @@ -290,13 +380,14 @@ int HippoT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, &this->_felec, - &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->k_multipole.run(&this->atom->x, &this->atom->extra, &_pval, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); return GX; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index f9020cf9a6..56da15f8aa 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -15,7 +15,8 @@ #if defined(NV_KERNEL) || defined(USE_HIP) #include -#include "lal_aux_fun1.h" +#include "lal_hippo_extra.h" +//#include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int #endif @@ -404,6 +405,318 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + repulsion = Pauli repulsion interactions + adapted from Tinker erepel1b() routine +------------------------------------------------------------------------- */ + +__kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict sp_nonpolar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global acctyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp off2, const numtyp cut2, + const numtyp c0, const numtyp c1, const numtyp c2, + const numtyp c3, const numtyp c4, const numtyp c5) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int l=0; l<6; l++) virial[l]=(acctyp)0; + } + + acctyp4 tq; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; + + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + if (iioff2) continue; + + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + + numtyp sizk = coeff[jtype].x; // sizpr[jtype]; + numtyp dmpk = coeff[jtype].y; // dmppr[jtype]; + numtyp valk = coeff[jtype].z; // elepr[jtype]; + + const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; + numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)]; + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + numtyp dik = dix*dkx + diy*dky + diz*dkz; + numtyp qik = qix*qkx + qiy*qky + qiz*qkz; + numtyp diqk = dix*qkx + diy*qky + diz*qkz; + numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + qixx*qkxx + qiyy*qkyy + qizz*qkzz; + + // additional intermediates involving moments and distance + + numtyp dirx = diy*zr - diz*yr; + numtyp diry = diz*xr - dix*zr; + numtyp dirz = dix*yr - diy*xr; + numtyp dkrx = dky*zr - dkz*yr; + numtyp dkry = dkz*xr - dkx*zr; + numtyp dkrz = dkx*yr - dky*xr; + numtyp dikx = diy*dkz - diz*dky; + numtyp diky = diz*dkx - dix*dkz; + numtyp dikz = dix*dky - diy*dkx; + numtyp qirx = qiz*yr - qiy*zr; + numtyp qiry = qix*zr - qiz*xr; + numtyp qirz = qiy*xr - qix*yr; + numtyp qkrx = qkz*yr - qky*zr; + numtyp qkry = qkx*zr - qkz*xr; + numtyp qkrz = qky*xr - qkx*yr; + numtyp qikx = qky*qiz - qkz*qiy; + numtyp qiky = qkz*qix - qkx*qiz; + numtyp qikz = qkx*qiy - qky*qix; + numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz; + numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz; + numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz; + numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz; + numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz; + numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz; + numtyp qikrx = qizk*yr - qiyk*zr; + numtyp qikry = qixk*zr - qizk*xr; + numtyp qikrz = qiyk*xr - qixk*yr; + numtyp qkirx = qkzi*yr - qkyi*zr; + numtyp qkiry = qkxi*zr - qkzi*xr; + numtyp qkirz = qkyi*xr - qkxi*yr; + numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; + numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; + numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; + numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; + numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; + numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; + numtyp diqkrx = diqkz*yr - diqky*zr; + numtyp diqkry = diqkx*zr - diqkz*xr; + numtyp diqkrz = diqky*xr - diqkx*yr; + numtyp dkqirx = dkqiz*yr - dkqiy*zr; + numtyp dkqiry = dkqix*zr - dkqiz*xr; + numtyp dkqirz = dkqiy*xr - dkqix*yr; + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); + + // get reciprocal distance terms for this interaction + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; + + // get damping coefficients for the Pauli repulsion energy + numtyp dmpik[11]; + damprep(r,r2,rr1,rr3,rr5,rr7,rr9,rr11,11,dmpi,dmpk,dmpik); + + // calculate intermediate terms needed for the energy + + numtyp term1 = vali*valk; + numtyp term2 = valk*dir - vali*dkr + dik; + numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + numtyp term4 = dir*qkr - dkr*qir - 4.0*qik; + numtyp term5 = qir*qkr; + numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + + term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8]; + + // compute the Pauli repulsion energy for this interaction + + numtyp sizik = sizi * sizk * factor_repel; + numtyp e = sizik * eterm * rr1; + + // calculate intermediate terms for force and torque + + numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + + term4*dmpik[8] + term5*dmpik[10]; + term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6]; + term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6]; + term3 = (numtyp)2.0 * dmpik[4]; + term4 = (numtyp)2.0 * (-valk*dmpik[4] + dkr*dmpik[6] - qkr*dmpik[8]); + term5 = (numtyp)2.0 * (-vali*dmpik[4] - dir*dmpik[6] - qir*dmpik[8]); + numtyp term6 = (numtyp)4.0 * dmpik[6]; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + frcx = frcx*rr1 + eterm*rr3*xr; + frcy = frcy*rr1 + eterm*rr3*yr; + frcz = frcz*rr1 + eterm*rr3*zr; + + // compute the torque components for this interaction + + numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + ttmix = sizik * ttmix * rr1; + ttmiy = sizik * ttmiy * rr1; + ttmiz = sizik * ttmiz * rr1; + + // use energy switching if near the cutoff distance + + if (r2 > cut2) { + numtyp r3 = r2 * r; + numtyp r4 = r2 * r2; + numtyp r5 = r2 * r3; + numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0; + numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp).0*c4*r3 + + (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1; + dtaper *= e * rr1; + e *= taper; + frcx = frcx*taper - dtaper*xr; + frcy = frcy*taper - dtaper*yr; + frcz = frcz*taper - dtaper*zr; + ttmix *= taper; + ttmiy *= taper; + ttmiz *= taper; + } + + energy += e; + + // increment force-based gradient and torque on atom I + + f.x += frcx; + f.y += frcy; + f.z += frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + // increment the internal virial tensor components + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii { const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -65,6 +66,18 @@ class Hippo : public BaseAmoeba { const double aewald, const double off2_disp, double *charge, double *boxlo, double *prd); + /// Compute multipole real-space with device neighboring + virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double felec, const double off2_mpole, double *charge, + double *boxlo, double *prd, void **tep_ptr); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -105,6 +118,8 @@ class Hippo : public BaseAmoeba { UCL_Kernel k_dispersion; + UCL_Vector _pval; + protected: bool _allocated; int dispersion_real(const int eflag, const int vflag); diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index b9e31e7b20..fa09e7bce4 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -38,6 +38,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -73,7 +74,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - host_csix, host_adisp, nlocal, nall, max_nbors, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, polar_dscale, polar_uscale); @@ -97,7 +99,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - host_csix, host_adisp, nlocal, nall, max_nbors, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, polar_dscale, polar_uscale); diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h new file mode 100644 index 0000000000..890ce51121 --- /dev/null +++ b/lib/gpu/lal_hippo_extra.h @@ -0,0 +1,326 @@ +/// ************************************************************************** +// hippo_extra.h +// ------------------- +// Trung Dac Nguyen +// +// Device code for hippo math routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndactrung@gmail.com +// ***************************************************************************/* + +#ifndef LAL_HIPPO_EXTRA_H +#define LAL_HIPPO_EXTRA_H + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#else +#endif + +#define MY_PI2 (numtyp)1.57079632679489661923 +#define MY_PI4 (numtyp)0.78539816339744830962 + +/* ---------------------------------------------------------------------- + damprep generates coefficients for the Pauli repulsion + damping function for powers of the interatomic distance + + literature reference: + + J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An + Anisotropic, Atomic Multipole Model", Journal of Chemical Physics, + 150, 084104 (2019) +------------------------------------------------------------------------- */ + +ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, + const numtyp rr3, const numtyp rr5, const numtyp rr7, + const numtyp rr9, const numtyp rr11, const int rorder, + const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11]) +{ + numtyp r3,r4; + numtyp r5,r6,r7,r8; + numtyp s,ds,d2s; + numtyp d3s,d4s,d5s; + numtyp dmpi2,dmpk2; + numtyp dmpi22,dmpi23; + numtyp dmpi24,dmpi25; + numtyp dmpi26,dmpi27; + numtyp dmpk22,dmpk23; + numtyp dmpk24,dmpk25; + numtyp dmpk26; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp pre,term,tmp; + + // compute tolerance value for damping exponents + + eps = (numtyp)0.001; + diff = dmpi-dmpk; + if (diff < (numtyp)0) diff = -diff; + + // treat the case where alpha damping exponents are equal + + if (diff < eps) { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + r6 = r5 * r; + r7 = r6 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dampi = dmpi2 * r; + expi = ucl_exp(-dampi); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpi26 = dmpi25 * dmpi2; + pre = (numtyp)128.0; + s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi; + + ds = (dmpi22*r3 + dmpi23*r4) * expi / 3.0; + d2s = dmpi24 * expi * r5 / 9.0; + d3s = dmpi25 * expi * r6 / 45.0; + d4s = (dmpi25*r6 + dmpi26*r7) * expi / 315.0; + if (rorder >= 11) { + r8 = r7 * r; + dmpi27 = dmpi2 * dmpi26; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0; + } + + // treat the case where alpha damping exponents are unequal + + } else { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + dmpi2 = 0.5 * dmpi; + dmpk2 = 0.5 * dmpk; + dampi = dmpi2 * r; + dampk = dmpk2 * r; + expi = exp(-dampi); + expk = exp(-dampk); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpk22 = dmpk2 * dmpk2; + dmpk23 = dmpk22 * dmpk2; + dmpk24 = dmpk23 * dmpk2; + dmpk25 = dmpk24 * dmpk2; + term = dmpi22 - dmpk22; + pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0); + tmp = 4.0 * dmpi2 * dmpk2 / term; + s = (dampi-tmp)*expk + (dampk+tmp)*expi; + + ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - + (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + + (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + 4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - + 4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + + (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + + 4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - + (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - + (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + + (4.0/105.0)*dmpi25*dmpk2*r4/term + (8.0/21.0)*dmpi24*dmpk2*r3/term + + (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + 4.0*dmpi2*dmpk2/term) * expi; + + if (rorder >= 11) { + r6 = r5 * r; + dmpi26 = dmpi25 * dmpi2; + dmpk26 = dmpk25 * dmpk2; + d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - + (4.0/945.0)*dmpi2*dmpk26*r5/term - + (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - + (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + + (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + + (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; + } + } + + // convert partial derivatives into full derivatives + + s = s * rr1; + ds = ds * rr3; + d2s = d2s * rr5; + d3s = d3s * rr7; + d4s = d4s * rr9; + d5s = d5s * rr11; + dmpik[0] = 0.5 * pre * s * s; + dmpik[2] = pre * s * ds; + dmpik[4] = pre * (s*d2s + ds*ds); + dmpik[6] = pre * (s*d3s + 3.0*ds*d2s); + dmpik[8] = pre * (s*d4s + 4.0*ds*d3s + 3.0*d2s*d2s); + if (rorder >= 11) dmpik[10] = pre * (s*d5s + 5.0*ds*d4s + 10.0*d2s*d3s); +} + +/* ---------------------------------------------------------------------- + damppole generates coefficients for the charge penetration + damping function for powers of the interatomic distance + + literature references: + + L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the + Effective Fragment Potential Method: Theory and Application to + the Benzene Dimer", Journal of Computational Chemistry, 28, + 276-291 (2007) [Gordon f1 and f2 models] + + J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and + J. W. Ponder, "An Optimized Charge Penetration Model for Use with + the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19, + 276-291 (2017) +------------------------------------------------------------------------- */ + +ucl_inline void damppole(const numtyp r, const int rorder, + const numtyp alphai, const numtyp alphak, + numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampi6,dampi7; + numtyp dampi8; + numtyp dampk2,dampk3; + numtyp dampk4,dampk5; + numtyp dampk6; + + // compute tolerance and exponential damping factors + + eps = 0.001; + diff = fabs(alphai-alphak); + dampi = alphai * r; + dampk = alphak * r; + expi = exp(-dampi); + expk = exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpi[0] = 1.0 - (1.0 + 0.5*dampi)*expi; + dmpi[2] = 1.0 - (1.0 + dampi + 0.5*dampi2)*expi; + dmpi[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi; + dmpi[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + dampi4/30.0)*expi; + dmpi[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 4.0*dampi4/105.0 + dampi5/210.0)*expi; + if (diff < eps) { + dmpk[0] = dmpi[0]; + dmpk[2] = dmpi[2]; + dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + dmpk[8] = dmpi[8]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dampk5 = dampk2 * dampk3; + dmpk[0] = 1.0 - (1.0 + 0.5*dampk)*expk; + dmpk[2] = 1.0 - (1.0 + dampk + 0.5*dampk2)*expk; + dmpk[4] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk; + dmpk[6] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk; + dmpk[8] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + + 4.0*dampk4/105.0 + dampk5/210.0)*expk; + } + + // valence-valence charge penetration damping for Gordon f1 + + if (diff < eps) { + dampi6 = dampi3 * dampi3; + dampi7 = dampi3 * dampi4; + dmpik[0] = 1.0 - (1.0 + 11.0*dampi/16.0 + 3.0*dampi2/16.0 + + dampi3/48.0)*expi; + dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 + + 7.0*dampi3/48.0 + dampi4/48.0)*expi; + dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/144.0)*expi; + dmpik[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0)*expi; + dmpik[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + + dampi7/5040.0)*expi; + if (rorder >= 11) { + dampi8 = dampi4 * dampi4; + dmpik[10] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + + dampi7/5040.0 + dampi8/45360.0)*expi; + } + + } else { + alphai2 = alphai * alphai; + alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * termk; + dmpik[0] = 1.0 - termi2*(1.0 + 2.0*termk + 0.5*dampi)*expi - + termk2*(1.0 + 2.0*termi + 0.5*dampk)*expk; + dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi - + termk2*(1.0+dampk+0.5*dampk2)*expk - + 2.0*termi2*termk*(1.0+dampi)*expi - + 2.0*termk2*termi*(1.0+dampk)*expk; + dmpik[4] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + dampi2/3.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + dampk2/3.0)*expk; + dmpik[6] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + + dampi3/6.0 + dampi4/30.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 2.0*dampi2/5.0 + dampi3/15.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 2.0*dampk2/5.0 + dampk3/15.0)*expk; + dmpik[8] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 4.0*dampi4/105.0 + dampi5/210.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + + 4.0*dampk4/105.0 + dampk5/210.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 3.0*dampi2/7.0 + + 2.0*dampi3/21.0 + dampi4/105.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 3.0*dampk2/7.0 + + 2.0*dampk3/21.0 + dampk4/105.0)*expk; + + if (rorder >= 11) { + dampi6 = dampi3 * dampi3; + dampk6 = dampk3 * dampk3; + dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 5.0*dampi4/126.0 + 2.0*dampi5/315.0 + + dampi6/1890.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + + 2.0*dampk5/315.0 + dampk6/1890.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + + dampi4/63.0 + dampi5/945.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + + dampk4/63.0 + dampk5/945.0)*expk; + } + } +} + + + +#endif diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index a6e7b9edc6..91465abb82 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -59,6 +59,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -191,10 +192,10 @@ void PairHippoGPU::init_style() pdamp, thole, dirdamp, amtype2class, special_hal, special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, - special_polar_pscale, csix, adisp, atom->nlocal, - atom->nlocal+atom->nghost, mnf, maxspecial, - maxspecial15, cell_size, gpu_mode, screen, - polar_dscale, polar_uscale, tq_size); + special_polar_pscale, csix, adisp, pcore, palpha, + atom->nlocal, atom->nlocal+atom->nghost, mnf, + maxspecial, maxspecial15, cell_size, gpu_mode, + screen, polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE)