Fixed bugs in the polar real kernel in hippo, getting closer..

This commit is contained in:
Trung Nguyen
2021-09-26 09:11:09 -05:00
parent 5193dcf8c5
commit 7437c98628
5 changed files with 169 additions and 117 deletions

View File

@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full,
// leave the answers (forces, energies and virial) on the device,
// only copy them back in the last kernel (this one, or polar_real once done)
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
this->device->add_ans_object(this->ans);
//this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
//this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
@ -568,6 +568,94 @@ int HippoT::umutual2b(const int eflag, const int vflag) {
return GX;
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute polar real-space
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** HippoT::compute_polar_real(const int ago, const int inum_full,
const int nall, double **host_x,
int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole,
double **host_uind, double **host_uinp,
double *host_pval, double *sublo, double *subhi,
tagint *tag, int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
const double aewald, const double felec,
const double off2_polar, double *host_q,
double *boxlo, double *prd, void **tep_ptr) {
this->acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;
#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif
this->set_kernel(eflag,vflag);
// reallocate per-atom arrays, transfer data from the host
// and build the neighbor lists if needed
// NOTE:
// For now we invoke precompute() again here,
// to be able to turn on/off the udirect2b kernel (which comes before this)
// Once all the kernels are ready, precompute() is needed only once
// in the first kernel in a time step.
// We only need to cast uind and uinp from host to device here
// if the neighbor lists are rebuilt and other per-atom arrays
// (x, type, amtype, amgroup, rpole) are ready on the device.
int** firstneigh = nullptr;
firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole,
host_uind, host_uinp, host_pval, sublo, subhi, tag,
nspecial, special, nspecial15, special15,
eflag_in, vflag_in, eatom, vatom,
host_start, ilist, jnum, cpu_time,
success, host_q, boxlo, prd);
// ------------------- Resize _tep array ------------------------
if (inum_full>this->_max_tep_size) {
this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
this->_tep.resize(this->_max_tep_size*4);
}
*tep_ptr=this->_tep.host.begin();
this->_off2_polar = off2_polar;
this->_felec = felec;
this->_aewald = aewald;
const int red_blocks=polar_real(eflag,vflag);
// only copy answers (forces, energies and virial) back from the device
// in the last kernel (which is polar_real here)
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
this->device->add_ans_object(this->ans);
this->hd_balancer.stop_timer();
// copy tep from device to host
this->_tep.update_host(this->_max_tep_size*4,false);
/*
printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
for (int i = 0; i < 10; i++) {
numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
}
*/
return firstneigh; // nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate the polar real-space term, returning tep
// ---------------------------------------------------------------------------

View File

@ -1873,25 +1873,25 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
numtyp dmpi[9],dmpk[9];
numtyp dmpik[9];
damppole(r,9,alphai,alphak,dmpi,dmpk,dmpik);
numtyp rr3core = bn[1] - (1.0-factor_dscale)*rr3;
numtyp rr5core = bn[2] - (1.0-factor_dscale)*rr5;
numtyp rr3i = bn[1] - (1.0-factor_dscale*dmpi[2])*rr3;
numtyp rr5i = bn[2] - (1.0-factor_dscale*dmpi[4])*rr5;
numtyp rr7i = bn[3] - (1.0-factor_dscale*dmpi[6])*rr7;
numtyp rr9i = bn[4] - (1.0-factor_dscale*dmpi[8])*rr9;
numtyp rr3k = bn[1] - (1.0-factor_dscale*dmpk[2])*rr3;
numtyp rr5k = bn[2] - (1.0-factor_dscale*dmpk[4])*rr5;
numtyp rr7k = bn[3] - (1.0-factor_dscale*dmpk[6])*rr7;
numtyp rr9k = bn[4] - (1.0-factor_dscale*dmpk[8])*rr9;
numtyp rr5ik = bn[2] - (1.0-factor_wscale*dmpik[4])*rr5;
numtyp rr7ik = bn[3] - (1.0-factor_wscale*dmpik[6])*rr7;
numtyp rr3core = bn[1] - ((numtyp)1.0-factor_dscale)*rr3;
numtyp rr5core = bn[2] - ((numtyp)1.0-factor_dscale)*rr5;
numtyp rr3i = bn[1] - ((numtyp)1.0-factor_dscale*dmpi[2])*rr3;
numtyp rr5i = bn[2] - ((numtyp)1.0-factor_dscale*dmpi[4])*rr5;
numtyp rr7i = bn[3] - ((numtyp)1.0-factor_dscale*dmpi[6])*rr7;
numtyp rr9i = bn[4] - ((numtyp)1.0-factor_dscale*dmpi[8])*rr9;
numtyp rr3k = bn[1] - ((numtyp)1.0-factor_dscale*dmpk[2])*rr3;
numtyp rr5k = bn[2] - ((numtyp)1.0-factor_dscale*dmpk[4])*rr5;
numtyp rr7k = bn[3] - ((numtyp)1.0-factor_dscale*dmpk[6])*rr7;
numtyp rr9k = bn[4] - ((numtyp)1.0-factor_dscale*dmpk[8])*rr9;
numtyp rr5ik = bn[2] - ((numtyp)1.0-factor_wscale*dmpik[4])*rr5;
numtyp rr7ik = bn[3] - ((numtyp)1.0-factor_wscale*dmpik[6])*rr7;
// get the induced dipole field used for dipole torques
numtyp tix3 = 2.0*rr3i*ukx;
numtyp tiy3 = 2.0*rr3i*uky;
numtyp tiz3 = 2.0*rr3i*ukz;
numtyp tuir = -2.0*rr5i*ukr;
numtyp tix3 = (numtyp)2.0*rr3i*ukx;
numtyp tiy3 = (numtyp)2.0*rr3i*uky;
numtyp tiz3 = (numtyp)2.0*rr3i*ukz;
numtyp tuir = (numtyp)-2.0*rr5i*ukr;
ufld[0] += tix3 + xr*tuir;
ufld[1] += tiy3 + yr*tuir;
@ -1899,10 +1899,10 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
// get induced dipole field gradient used for quadrupole torques
numtyp tix5 = 4.0 * (rr5i*ukx);
numtyp tiy5 = 4.0 * (rr5i*uky);
numtyp tiz5 = 4.0 * (rr5i*ukz);
tuir = -2.0*rr7i*ukr;
numtyp tix5 = (numtyp)4.0 * (rr5i*ukx);
numtyp tiy5 = (numtyp)4.0 * (rr5i*uky);
numtyp tiz5 = (numtyp)4.0 * (rr5i*ukz);
tuir = (numtyp)-2.0*rr7i*ukr;
dufld[0] += xr*tix5 + xr*xr*tuir;
dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir;
@ -1911,7 +1911,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
dufld[5] += zr*tiz5 + zr*zr*tuir;
// get the dEd/dR terms used for direct polarization force
// get the field gradient for direct polarization force
numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i;
numtyp term1k,term2k,term3k,term4k,term5k,term6k,term7k,term8k;
@ -1921,16 +1921,16 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
term1i = rr3i - rr5i*xr*xr;
term1core = rr3core - rr5core*xr*xr;
term2i = 2.0*rr5i*xr ;
term2i = (numtyp)2.0*rr5i*xr ;
term3i = rr7i*xr*xr - rr5i;
term4i = 2.0*rr5i;
term5i = 5.0*rr7i*xr;
term4i = (numtyp)2.0*rr5i;
term5i = (numtyp)5.0*rr7i*xr;
term6i = rr9i*xr*xr;
term1k = rr3k - rr5k*xr*xr;
term2k = 2.0*rr5k*xr;
term2k = (numtyp)2.0*rr5k*xr;
term3k = rr7k*xr*xr - rr5k;
term4k = 2.0*rr5k;
term5k = 5.0*rr7k*xr;
term5k = (numtyp)5.0*rr7k*xr;
term6k = rr9k*xr*xr;
tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i -
qixx*term4i + qix*term5i - qir*term6i + (qiy*yr+qiz*zr)*rr7i;
@ -1939,16 +1939,16 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
term1i = rr3i - rr5i*yr*yr;
term1core = rr3core - rr5core*yr*yr;
term2i = 2.0*rr5i*yr;
term2i = (numtyp)2.0*rr5i*yr;
term3i = rr7i*yr*yr - rr5i;
term4i = 2.0*rr5i;
term5i = 5.0*rr7i*yr;
term4i = (numtyp)2.0*rr5i;
term5i = (numtyp)5.0*rr7i*yr;
term6i = rr9i*yr*yr;
term1k = rr3k - rr5k*yr*yr;
term2k = 2.0*rr5k*yr;
term2k = (numtyp)2.0*rr5k*yr;
term3k = rr7k*yr*yr - rr5k;
term4k = 2.0*rr5k;
term5k = 5.0*rr7k*yr;
term4k = (numtyp)2.0*rr5k;
term5k = (numtyp)5.0*rr7k*yr;
term6k = rr9k*yr*yr;
tiyy = vali*term1i + corei*term1core + diy*term2i - dir*term3i -
qiyy*term4i + qiy*term5i - qir*term6i + (qix*xr+qiz*zr)*rr7i;
@ -1957,16 +1957,16 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
term1i = rr3i - rr5i*zr*zr;
term1core = rr3core - rr5core*zr*zr;
term2i = 2.0*rr5i*zr;
term2i = (numtyp)2.0*rr5i*zr;
term3i = rr7i*zr*zr - rr5i;
term4i = 2.0*rr5i;
term5i = 5.0*rr7i*zr;
term4i = (numtyp)2.0*rr5i;
term5i = (numtyp)5.0*rr7i*zr;
term6i = rr9i*zr*zr;
term1k = rr3k - rr5k*zr*zr;
term2k = 2.0*rr5k*zr;
term2k = (numtyp)2.0*rr5k*zr;
term3k = rr7k*zr*zr - rr5k;
term4k = 2.0*rr5k;
term5k = 5.0*rr7k*zr;
term4k = (numtyp)2.0*rr5k;
term5k = (numtyp)5.0*rr7k*zr;
term6k = rr9k*zr*zr;
tizz = vali*term1i + corei*term1core + diz*term2i - dir*term3i -
qizz*term4i + qiz*term5i - qir*term6i + (qix*xr+qiy*yr)*rr7i;
@ -1978,17 +1978,17 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
term1core = rr5core*xr*yr;
term3i = rr5i*yr;
term4i = yr * (rr7i*xr);
term5i = 2.0*rr5i;
term6i = 2.0*rr7i*xr;
term7i = 2.0*rr7i*yr;
term5i = (numtyp)2.0*rr5i;
term6i = (numtyp)2.0*rr7i*xr;
term7i = (numtyp)2.0*rr7i*yr;
term8i = yr*rr9i*xr;
term2k = rr5k*xr;
term1k = yr * term2k;
term3k = rr5k*yr;
term4k = yr * (rr7k*xr);
term5k = 2.0*rr5k;
term6k = 2.0*rr7k*xr;
term7k = 2.0*rr7k*yr;
term5k = (numtyp)2.0*rr5k;
term6k = (numtyp)2.0*rr7k*xr;
term7k = (numtyp)2.0*rr7k*yr;
term8k = yr*rr9k*xr;
tixy = -vali*term1i - corei*term1core + diy*term2i + dix*term3i -
dir*term4i - qixy*term5i + qiy*term6i + qix*term7i - qir*term8i;
@ -2000,17 +2000,17 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
term1core = rr5core*xr*zr;
term3i = rr5i*zr;
term4i = zr * (rr7i*xr);
term5i = 2.0*rr5i;
term6i = 2.0*rr7i*xr;
term7i = 2.0*rr7i*zr;
term5i = (numtyp)2.0*rr5i;
term6i = (numtyp)2.0*rr7i*xr;
term7i = (numtyp)2.0*rr7i*zr;
term8i = zr*rr9i*xr;
term2k = rr5k*xr;
term1k = zr * term2k;
term3k = rr5k*zr;
term4k = zr * (rr7k*xr);
term5k = 2.0*rr5k;
term6k = 2.0*rr7k*xr;
term7k = 2.0*rr7k*zr;
term5k = (numtyp)2.0*rr5k;
term6k = (numtyp)2.0*rr7k*xr;
term7k = (numtyp)2.0*rr7k*zr;
term8k = zr*rr9k*xr;
tixz = -vali*term1i - corei*term1core + diz*term2i + dix*term3i -
dir*term4i - qixz*term5i + qiz*term6i + qix*term7i - qir*term8i;
@ -2022,17 +2022,17 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
term1core = rr5core*yr*zr;
term3i = rr5i*zr;
term4i = zr * (rr7i*yr);
term5i = 2.0*rr5i;
term6i = 2.0*rr7i*yr;
term7i = 2.0*rr7i*zr;
term5i = (numtyp)2.0*rr5i;
term6i = (numtyp)2.0*rr7i*yr;
term7i = (numtyp)2.0*rr7i*zr;
term8i = zr*rr9i*yr;
term2k = rr5k*yr;
term1k = zr * term2k;
term3k = rr5k*zr;
term4k = zr * (rr7k*yr);
term5k = 2.0*rr5k;
term6k = 2.0*rr7k*yr;
term7k = 2.0*rr7k*zr;
term5k = (numtyp)2.0*rr5k;
term6k = (numtyp)2.0*rr7k*yr;
term7k = (numtyp)2.0*rr7k*zr;
term8k = zr*rr9k*yr;
tiyz = -vali*term1i - corei*term1core + diz*term2i + diy*term3i -
dir*term4i - qiyz*term5i + qiz*term6i + qiy*term7i - qir*term8i;
@ -2043,14 +2043,14 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
numtyp depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz;
numtyp depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz;
numtyp frcx = -2.0 * depx;
numtyp frcy = -2.0 * depy;
numtyp frcz = -2.0 * depz;
numtyp frcx = (numtyp)-2.0 * depx;
numtyp frcy = (numtyp)-2.0 * depy;
numtyp frcz = (numtyp)-2.0 * depz;
// get the dEp/dR terms used for direct polarization force
// poltyp == MUTUAL && hippo
// tixx and tkxx
term1 = 2.0 * rr5ik;
term1 = (numtyp)2.0 * rr5ik;
term2 = term1*xr;
term3 = rr5ik - rr7ik*xr*xr;
tixx = uix*term2 + uir*term3;
@ -2095,55 +2095,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
frcy = frcy - depy;
frcz = frcz - depz;
// get the dtau/dr terms used for mutual polarization force
// poltyp == MUTUAL && hippo
term1 = bn[2] - usc3*rr5;
term2 = bn[3] - usc5*rr7;
term3 = usr5 + term1;
term4 = rr3 * factor_uscale;
term5 = -xr*term3 + rc3[0]*term4;
term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0];
tixx = uix*term5 + uir*term6;
tkxx = ukx*term5 + ukr*term6;
term5 = -yr*term3 + rc3[1]*term4;
term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1];
tiyy = uiy*term5 + uir*term6;
tkyy = uky*term5 + ukr*term6;
term5 = -zr*term3 + rc3[2]*term4;
term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2];
tizz = uiz*term5 + uir*term6;
tkzz = ukz*term5 + ukr*term6;
term4 = -usr5 * yr;
term5 = -xr*term1 + rr3*urc3[0];
term6 = xr*yr*term2 - rr5*yr*urc5[0];
tixy = uix*term4 + uiy*term5 + uir*term6;
tkxy = ukx*term4 + uky*term5 + ukr*term6;
term4 = -usr5 * zr;
term6 = xr*zr*term2 - rr5*zr*urc5[0];
tixz = uix*term4 + uiz*term5 + uir*term6;
tkxz = ukx*term4 + ukz*term5 + ukr*term6;
term5 = -yr*term1 + rr3*urc3[1];
term6 = yr*zr*term2 - rr5*zr*urc5[1];
tiyz = uiy*term4 + uiz*term5 + uir*term6;
tkyz = uky*term4 + ukz*term5 + ukr*term6;
depx = tixx*ukxp + tixy*ukyp + tixz*ukzp
+ tkxx*uixp + tkxy*uiyp + tkxz*uizp;
depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp
+ tkxy*uixp + tkyy*uiyp + tkyz*uizp;
depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp
+ tkxz*uixp + tkyz*uiyp + tkzz*uizp;
frcx = frcx + depx;
frcy = frcy + depy;
frcz = frcz + depz;
f.x -= frcx;
f.y -= frcy;
f.z -= frcz;

View File

@ -90,6 +90,19 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
const double aewald, const double felec, const double off2_mpole, double *charge,
double *boxlo, double *prd, void **tep_ptr);
/// Compute polar real-space with device neighboring
virtual int** compute_polar_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
int *host_amgroup, double **host_rpole, double **host_uind,
double **host_uinp, double *host_pval, double *sublo, double *subhi,
tagint *tag, int **nspecial, tagint **special,
int *nspecial15, tagint **special15,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success,
const double aewald, const double felec, const double off2_polar,
double *charge, double *boxlo, double *prd, void **tep_ptr);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

View File

@ -194,7 +194,7 @@ int** hippo_gpu_compute_polar_real(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp,
double *sublo, double *subhi, tagint *tag, int **nspecial,
double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint** special15,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
@ -202,7 +202,7 @@ int** hippo_gpu_compute_polar_real(const int ago, const int inum_full,
bool &success, const double aewald, const double felec, const double off2,
double *host_q, double *boxlo, double *prd, void **tep_ptr) {
return HIPPOMF.compute_polar_real(ago, inum_full, nall, host_x, host_type,
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval,
sublo, subhi, tag, nspecial, special, nspecial15, special15,
eflag, vflag, eatom, vatom, host_start, ilist, jnum,
cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);

View File

@ -108,7 +108,7 @@ int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, const int nall
int ** hippo_gpu_compute_polar_real(const int ago, const int inum, const int nall,
double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
double **host_rpole, double **host_uind, double **host_uinp,
double **host_rpole, double **host_uind, double **host_uinp, double *host_pval,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int* nspecial15, tagint** special15,
const bool eflag, const bool vflag, const bool eatom, const bool vatom,
@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
gpu_multipole_real_ready = true;
gpu_udirect2b_ready = false;
gpu_umutual2b_ready = false;
gpu_polar_real_ready = false;
gpu_polar_real_ready = true;
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
}
@ -1089,7 +1089,7 @@ void PairHippoGPU::polar_real()
firstneigh = hippo_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
atom->type, amtype, amgroup,
rpole, uind, uinp, sublo, subhi,
rpole, uind, uinp, pval, sublo, subhi,
atom->tag, atom->nspecial, atom->special,
atom->nspecial15, atom->special15,
eflag, vflag, eflag_atom, vflag_atom,