diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 0f87104832..77bbebbb9a 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -444,7 +444,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 1f9c14d4da..95f18db7d2 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,6 +1032,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; + if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; @@ -1772,7 +1773,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - + const numtyp4 pol1j = polar1[j]; numtyp ck = polar1[j].x; // rpole[j][0]; numtyp dkx = polar1[j].y; // rpole[j][1]; @@ -1800,6 +1801,11 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; factor_wscale = sp_pol.x; // special_polar_wscale[sbmask15(jextra)]; + // NOTE: for in.water_box/water_hexamer.hippo: there exist wscale = 0.2 + //if (factor_wscale < (numtyp)1.0) continue; //factor_wscale = (numtyp)0; + + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, r, factor_wscale); + if (igroup == jgroup) { factor_dscale = factor_pscale = sp_pol.y; // special_polar_piscale[sbmask15(jextra)]; factor_uscale = polar_uscale; @@ -1910,7 +1916,8 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; dufld[5] += zr*tiz5 + zr*zr*tuir; - + + // get the field gradient for direct polarization force numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; @@ -1929,7 +1936,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, term1k = rr3k - rr5k*xr*xr; term2k = (numtyp)2.0*rr5k*xr; term3k = rr7k*xr*xr - rr5k; - term4k = 2.0*rr5k; + term4k = (numtyp)2.0*rr5k; term5k = (numtyp)5.0*rr7k*xr; term6k = rr9k*xr*xr; tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i - @@ -2046,7 +2053,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcx = (numtyp)-2.0 * depx; numtyp frcy = (numtyp)-2.0 * depy; numtyp frcz = (numtyp)-2.0 * depz; - + // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo // tixx and tkxx @@ -2159,7 +2166,7 @@ __kernel void k_special15(__global int * dev_nbor, int which = sj >> SBBITS & 3; int j = sj & NEIGHMASK; tagint jtag = tag[j]; - + if (i == 0 && j < 20) printf("GPU: j = %d; jtag = %d\n", j, jtag); if (!which) { int offset=ii; for (int k=0; k= 11) { r8 = r7 * r; dmpi27 = dmpi2 * dmpi26; - d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0; } // treat the case where alpha damping exponents are unequal @@ -97,12 +97,12 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, r3 = r2 * r; r4 = r3 * r; r5 = r4 * r; - dmpi2 = 0.5 * dmpi; - dmpk2 = 0.5 * dmpk; + dmpi2 = (numtyp)0.5 * dmpi; + dmpk2 = (numtyp)0.5 * dmpk; dampi = dmpi2 * r; dampk = dmpk2 * r; - expi = exp(-dampi); - expk = exp(-dampk); + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); dmpi22 = dmpi2 * dmpi2; dmpi23 = dmpi22 * dmpi2; dmpi24 = dmpi23 * dmpi2; @@ -112,34 +112,34 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, dmpk24 = dmpk23 * dmpk2; dmpk25 = dmpk24 * dmpk2; term = dmpi22 - dmpk22; - pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0); - tmp = 4.0 * dmpi2 * dmpk2 / term; + pre = (numtyp)8192.0 * dmpi23 * dmpk23 / ucl_powr(term,(numtyp)4.0); + tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; s = (dampi-tmp)*expk + (dampk+tmp)*expi; - ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; - d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - - (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + - (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + - 4.0*dmpi2*dmpk2/term) * expi; - d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - - (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - - 4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + - (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + - (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + - 4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi; - d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + - dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - - (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - - (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + - dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + - (4.0/105.0)*dmpi25*dmpk2*r4/term + (8.0/21.0)*dmpi24*dmpk2*r3/term + - (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + (4.0/105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/21.0)*dmpi24*dmpk2*r3/term + + (12.0/7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; if (rorder >= 11) { @@ -217,8 +217,8 @@ ucl_inline void damppole(const numtyp r, const int rorder, diff = fabs(alphai-alphak); dampi = alphai * r; dampk = alphak * r; - expi = exp(-dampi); - expk = exp(-dampk); + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); // core-valence charge penetration damping for Gordon f1 @@ -308,15 +308,15 @@ ucl_inline void damppole(const numtyp r, const int rorder, if (rorder >= 11) { dampi6 = dampi3 * dampi3; dampk6 = dampk3 * dampk3; - dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - 5.0*dampi4/126.0 + 2.0*dampi5/315.0 + - dampi6/1890.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + - 2.0*dampk5/315.0 + dampk6/1890.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + - dampi4/63.0 + dampi5/945.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + - dampk4/63.0 + dampk5/945.0)*expk; + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + 5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dampk4/63.0 + dampk5/(numtyp)945.0)*expk; } } } diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e6b3e6ef70..4fa8a5d892 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -366,6 +366,7 @@ void PairAmoeba::polar_real() yr = x[j][1] - yi; zr = x[j][2] - zi; r2 = xr*xr + yr*yr + zr*zr; + if (r2 > off2) continue; jtype = amtype[j]; @@ -393,7 +394,7 @@ void PairAmoeba::polar_real() factor_uscale = 1.0; } } - + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale); r = sqrt(r2); ck = rpole[j][0]; dkx = rpole[j][1]; @@ -567,7 +568,7 @@ void PairAmoeba::polar_real() ufld[j][0] += tkx3 + xr*tukr; ufld[j][1] += tky3 + yr*tukr; ufld[j][2] += tkz3 + zr*tukr; - + // get induced dipole field gradient used for quadrupole torques if (amoeba) { @@ -579,7 +580,6 @@ void PairAmoeba::polar_real() tkz5 = 2.0 * (psr5*uiz+dsr5*uizp); tuir = -psr7*ukr - dsr7*ukrp; tukr = -psr7*uir - dsr7*uirp; - // reached here... } else if (hippo) { tix5 = 4.0 * (rr5i*ukx); tiy5 = 4.0 * (rr5i*uky); @@ -597,7 +597,6 @@ void PairAmoeba::polar_real() dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir; dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir; dufld[i][5] += zr*tiz5 + zr*zr*tuir; - dufld[j][0] -= xr*tkx5 + xr*xr*tukr; dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr; dufld[j][2] -= yr*tky5 + yr*yr*tukr; @@ -668,7 +667,7 @@ void PairAmoeba::polar_real() frcx = depx; frcy = depy; frcz = depz; - + // get the dEp/dR terms used for direct polarization force term1 = bn[2] - psc3*rr5; @@ -855,6 +854,7 @@ void PairAmoeba::polar_real() frcx = -2.0 * depx; frcy = -2.0 * depy; frcz = -2.0 * depz; + } // get the dtau/dr terms used for mutual polarization force @@ -1199,6 +1199,8 @@ void PairAmoeba::polar_real() torque2force(i,tep,fix,fiy,fiz,fpolar); + //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, tep[0], tep[1], tep[2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 5157739f0e..1ff35e7ce1 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -242,6 +242,47 @@ void PairAmoeba::compute(int eflag, int vflag) time_induce = time_polar = time_qxfer = 0.0; } + { // DEBUGGING + double **x = atom->x; + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + int i,ii,j,jj; + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + double xi = x[i][0]; + double yi = x[i][1]; + double zi = x[i][2]; + + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + double factor_mpole = special_mpole[sbmask15(j)]; + j &= NEIGHMASK15; + + double xr = x[j][0] - xi; + double yr = x[j][1] - yi; + double zr = x[j][2] - zi; + double r2 = xr*xr + yr*yr + zr*zr; + if (r2 > off2) continue; + double r = sqrt(r2); + if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); + } + } + + + } // DEBUGGING + double evdwl; evdwl = 0.0; @@ -973,8 +1014,8 @@ void PairAmoeba::init_style() int irequest = neighbor->request(this,instance_me); // for DEBUGGING with GPU - //neighbor->requests[irequest]->half = 0; - //neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; // open debug output files // names are hard-coded diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 23395e5fe3..4da0056029 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -1137,6 +1137,8 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, _tq[2] = tq_ptr[4*i+2]; torque2force(i,_tq,fix,fiy,fiz,force_comp); + //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, _tq[0], _tq[1], _tq[2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i];