diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 8d9af4706e..917166c423 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -140,7 +140,7 @@ void AmoebaT::clear() { coeff_amclass.clear(); sp_polar.clear(); sp_nonpolar.clear(); - + this->clear_atomic(); } @@ -169,7 +169,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_mpole, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -194,7 +194,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { - int ainum=this->ans->inum(); + int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -216,7 +216,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->short_nbor_polar_avail = true; } - + this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 1deb3e3bb5..fdb959f3e2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -492,7 +492,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; //int jtype=jx.w; - + // Compute r12 numtyp xr = jx.x - ix.x; numtyp yr = jx.y - ix.y; @@ -500,7 +500,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp r2 = xr*xr + yr*yr + zr*zr; //if (r2>off2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -533,12 +533,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -585,11 +585,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -650,20 +650,20 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); // compute the torque components for this interaction - numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); // increment force-based gradient and torque on first site @@ -691,12 +691,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -898,7 +898,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - + scalek = factor_pscale; bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; @@ -918,7 +918,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1044,7 +1044,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) + // if (poltyp != DIRECT) numtyp scale3 = (numtyp)1.0; numtyp scale5 = (numtyp)1.0; numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] @@ -1056,7 +1056,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, scale3 = (numtyp)1.0 - expdamp; scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); } - + } else { // damp == 0: ??? } @@ -1071,17 +1071,17 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; tdipdip[4] = bcn[1]*yr*zr; tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; - //if (i==0 && j == 10) + //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - + _fieldp[0] += fid[0]; _fieldp[1] += fid[1]; _fieldp[2] += fid[2]; @@ -1093,7 +1093,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); - + const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -1383,7 +1383,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp tiy3 = psr3*uky + dsr3*ukyp; numtyp tiz3 = psr3*ukz + dsr3*ukzp; numtyp tuir = -psr5*ukr - dsr5*ukrp; - + ufld[0] += tix3 + xr*tuir; ufld[1] += tiy3 + yr*tuir; ufld[2] += tiz3 + zr*tuir; @@ -1394,14 +1394,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); tuir = -psr7*ukr - dsr7*ukrp; - + dufld[0] += xr*tix5 + xr*xr*tuir; dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; dufld[2] += yr*tiy5 + yr*yr*tuir; dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; dufld[5] += zr*tiz5 + zr*zr*tuir; - + // get the dEd/dR terms used for direct polarization force term1 = bn[2] - dsc3*rr5; @@ -1473,7 +1473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp frcz = depz; // get the dEp/dR terms used for direct polarization force - + // tixx and tkxx term1 = bn[2] - psc3*rr5; term2 = bn[3] - psc5*rr7; @@ -1550,7 +1550,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // get the dtau/dr terms used for mutual polarization force // poltyp == MUTUAL && amoeba - + term1 = bn[2] - usc3*rr5; term2 = bn[3] - usc5*rr7; term3 = usr5 + term1; @@ -1617,7 +1617,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // ii { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// Special polar values [0-4]: + /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, /// sp_polar.z = special_polar_piscale /// sp_polar.w = special_mpole UCL_D_Vec sp_polar; - /// Special nonpolar values [0-4]: + /// Special nonpolar values [0-4]: /// sp_nonpolar.x = special_hal /// sp_nonpolar.y special_repel /// sp_nonpolar.z = special_disp @@ -97,7 +97,7 @@ class Amoeba : public BaseAmoeba { int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); int polar_real(const int eflag, const int vflag); - + }; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c4fdb8c9e5..3728fbe85e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -106,7 +106,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _threads_per_atom); if (success!=0) return success; - + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -121,7 +121,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _maxspecial=maxspecial; _maxspecial15=maxspecial15; - // allocate per-atom array tep + // allocate per-atom array tep int ef_nall=nlocal; //nall; if (ef_nall==0) @@ -250,7 +250,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, const double aewald, const double felec, + bool &success, const double aewald, const double felec, const double off2_polar, double *host_q, const int nlocal, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); @@ -280,7 +280,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f dev_special15_t.clear(); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } *tep_ptr=_tep.host.begin(); @@ -320,7 +320,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f _off2_polar = off2_polar; _felec = felec; const int red_blocks=polar_real(eflag,vflag); - + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -375,7 +375,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall dev_special15_t.clear(); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } if (inum_full==0) { @@ -462,7 +462,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -509,7 +509,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -560,7 +560,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (inum_full>_max_fieldp_size) { @@ -698,7 +698,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -745,7 +745,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -809,7 +809,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, int idx = n+i*nstride; pextra[idx] = uinp[i][0]; pextra[idx+1] = uinp[i][1]; - pextra[idx+2] = uinp[i][2]; + pextra[idx+2] = uinp[i][2]; } } @@ -818,7 +818,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = pval[i]; + pextra[idx] = pval[i]; } } } @@ -846,7 +846,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); - + _compiled=true; #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) @@ -874,13 +874,13 @@ int BaseAmoebaT::add_onefive_neighbors() { int _nall=atom->nall(); int ainum=ans->inum(); int nbor_pitch=nbor->nbor_pitch(); - + k_special15.set_size(GX,BX); k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), &atom->dev_tag, &dev_nspecial15, &dev_special15, &ainum, &_nall, &nbor_pitch, &_threads_per_atom); - + return GX; } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index fc665ec731..bd30fc3fbb 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -287,7 +287,7 @@ class BaseAmoeba { virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; - + }; } diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index caf910863f..ac221f8376 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -145,7 +145,7 @@ void HippoT::clear() { coeff_amclass.clear(); sp_polar.clear(); sp_nonpolar.clear(); - + this->clear_atomic(); } @@ -199,7 +199,7 @@ int** HippoT::precompute(const int ago, const int inum_full, const int nall, this->dev_special15_t.clear(); this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } if (inum_full==0) { @@ -286,7 +286,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -339,7 +339,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_disp, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -397,7 +397,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -468,7 +468,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_mpole, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -537,7 +537,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (inum_full>this->_max_fieldp_size) { @@ -569,7 +569,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, // --------------------------------------------------------------------------- template int HippoT::udirect2b(const int eflag, const int vflag) { - int ainum=this->ans->inum(); + int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -591,7 +591,7 @@ int HippoT::udirect2b(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->short_nbor_polar_avail = true; } - + this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &coeff_amclass, &sp_polar, @@ -756,7 +756,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -803,7 +803,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index f643f2b994..b282586efb 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -491,7 +491,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; //int jtype=jx.w; - + // Compute r12 numtyp xr = ix.x - jx.x; numtyp yr = ix.y - jx.y; @@ -499,7 +499,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp r2 = xr*xr + yr*yr + zr*zr; if (r2>off2) continue; - + const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -514,7 +514,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qkyz = pol3j.x; // rpole[j][9]; numtyp qkzz = pol3j.y; // rpole[j][12]; int jtype = pol3j.z; // amtype[j]; - + numtyp sizk = coeff[jtype].x; // sizpr[jtype]; numtyp dmpk = coeff[jtype].y; // dmppr[jtype]; numtyp valk = coeff[jtype].z; // elepr[jtype]; @@ -534,12 +534,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -586,11 +586,11 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -616,7 +616,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); numtyp term4 = dir*qkr - dkr*qir - 4.0*qik; numtyp term5 = qir*qkr; - numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + + numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8]; // compute the Pauli repulsion energy for this interaction @@ -626,7 +626,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, // calculate intermediate terms for force and torque - numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + + numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + term4*dmpik[8] + term5*dmpik[10]; term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6]; term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6]; @@ -637,23 +637,23 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); frcx = frcx*rr1 + eterm*rr3*xr; frcy = frcy*rr1 + eterm*rr3*yr; frcz = frcz*rr1 + eterm*rr3*zr; // compute the torque components for this interaction - - numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - + + numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); ttmix = sizik * ttmix * rr1; ttmiy = sizik * ttmiy * rr1; @@ -706,7 +706,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + int jtype = polar3[j].z; // amtype[j]; int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; @@ -816,7 +816,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp dk = ak * r; numtyp expi = ucl_exp(-di); numtyp expk = ucl_exp(-dk); - + numtyp ai2,ak2; numtyp di4,di5; numtyp dk2,dk3; @@ -844,7 +844,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; - ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + + ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0); } else { @@ -856,7 +856,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; - + // apply damping and scaling factors for this interaction numtyp scale = factor_disp * damp*damp; @@ -892,7 +892,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, virial[4] += vzx; virial[5] += vzy; } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -1043,12 +1043,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -1095,11 +1095,11 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -1164,16 +1164,16 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11; rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1; rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; - numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + - term1i*rr1i + term1k*rr1k + term1ik*rr1ik + - term2i*rr3i + term2k*rr3k + term2ik*rr3ik + + numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + + term1i*rr1i + term1k*rr1k + term1ik*rr1ik + + term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; // find damped multipole intermediates for force and torque - numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + - term1i*rr3i + term1k*rr3k + term1ik*rr3ik + - term2i*rr5i + term2k*rr5k + term2ik*rr5ik + + numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + + term1i*rr3i + term1k*rr3k + term1ik*rr3ik + + term2i*rr5i + term2k*rr5k + term2ik*rr5ik + term3i*rr7i + term3k*rr7k + term3ik*rr7ik; term1 = -corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik; term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik; @@ -1187,20 +1187,20 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); // compute the torque components for this interaction - numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); // increment force-based gradient and torque on first site @@ -1228,12 +1228,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1408,7 +1408,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, // find the field components for charge penetration damping numtyp dmpi[7],dmpk[7]; dampdir(r,alphai,alphak,dmpi,dmpk); - + numtyp scalek = factor_dscale; numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3; numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5; @@ -1439,7 +1439,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, rr3k*dky + (numtyp)2.0*rr5k*qky; fip[2] = -zr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - rr3k*dkz + (numtyp)2.0*rr5k*qkz; - + // find terms needed later to compute mutual polarization _fieldp[0] += fid[0]; @@ -1453,7 +1453,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1595,7 +1595,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, } // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) + // if (poltyp != DIRECT) numtyp dmpik[5]; dampmut(r,alphai,alphak,dmpik); numtyp scalek = factor_wscale; @@ -1610,17 +1610,17 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, tdipdip[3] = -rr3ik + rr5ik*yr*yr; tdipdip[4] = rr5ik*yr*zr; tdipdip[5] = -rr3ik + rr5ik*zr*zr; - //if (i==0 && j == 10) + //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - + _fieldp[0] += fid[0]; _fieldp[1] += fid[1]; _fieldp[2] += fid[2]; @@ -1632,7 +1632,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; @@ -1905,7 +1905,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, // get the field gradient for direct polarization force - + numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; numtyp term1k,term2k,term3k,term4k,term5k,term6k,term7k,term8k; numtyp term1core; @@ -1987,7 +1987,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, dir*term4i - qixy*term5i + qiy*term6i + qix*term7i - qir*term8i; tkxy = -valk*term1k - corek*term1core - dky*term2k - dkx*term3k + dkr*term4k - qkxy*term5k + qky*term6k + qkx*term7k - qkr*term8k; - + term2i = rr5i*xr; term1i = zr * term2i; term1core = rr5core*xr*zr; @@ -2039,7 +2039,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcx = (numtyp)-2.0 * depx; numtyp frcy = (numtyp)-2.0 * depy; numtyp frcz = (numtyp)-2.0 * depz; - + // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo // tixx and tkxx @@ -2108,7 +2108,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // ii { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// Special polar values [0-4]: + /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, /// sp_polar.z = special_polar_piscale /// sp_polar.w = special_mpole UCL_D_Vec sp_polar; - /// Special nonpolar values [0-4]: + /// Special nonpolar values [0-4]: /// sp_nonpolar.x = special_hal /// sp_nonpolar.y special_repel /// sp_nonpolar.z = special_disp @@ -184,7 +184,7 @@ class Hippo : public BaseAmoeba { int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); int polar_real(const int eflag, const int vflag); - + }; } diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 16b697d88f..982cf894a6 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -129,7 +129,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd) { + double *host_q, double *boxlo, double *prd) { return HIPPOMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -175,7 +175,7 @@ int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h index 61bfebc17f..0b8f96f69b 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -116,46 +116,46 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; s = (dampi-tmp)*expk + (dampk+tmp)*expi; - ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - - ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + - ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - - (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - - (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + - (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + - ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; - d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + - dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - - ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - - ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + - (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + - dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + - ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + - ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + + ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - + if (rorder >= 11) { r6 = r5 * r; dmpi26 = dmpi25 * dmpi2; dmpk26 = dmpk25 * dmpk2; - d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + - dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - - (4.0/945.0)*dmpi2*dmpk26*r5/term - - (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - - (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + - dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + - (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + - (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + + d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - + (4.0/945.0)*dmpi2*dmpk26*r5/term - + (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - + (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + + (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + + (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; } } @@ -214,7 +214,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, // compute tolerance and exponential damping factors eps = (numtyp)0.001; - diff = alphai-alphak; + diff = alphai-alphak; if (diff < (numtyp)0) diff = -diff; dampi = alphai * r; dampk = alphak * r; @@ -231,7 +231,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; - dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi; if (diff < eps) { dmpk[0] = dmpi[0]; @@ -248,7 +248,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk; - dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk; } @@ -257,21 +257,21 @@ ucl_inline void damppole(const numtyp r, const int rorder, if (diff < eps) { dampi6 = dampi3 * dampi3; dampi7 = dampi3 * dampi4; - dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + + dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + dampi3/(numtyp)48.0)*expi; - dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi; - dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; - dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi; - dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + dampi7/(numtyp)5040.0)*expi; if (rorder >= 11) { dampi8 = dampi4 * dampi4; dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi; } @@ -282,41 +282,41 @@ ucl_inline void damppole(const numtyp r, const int rorder, termk = alphai2 / (alphai2-alphak2); termi2 = termi * termi; termk2 = termk * termk; - dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - + dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk; dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; - dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - - termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk; - dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + - dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - - termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - + dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk; - dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - - termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + - (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + - (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + + dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + + (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk; - + if (rorder >= 11) { dampi6 = dampi3 * dampi3; dampk6 = dampk3 * dampk3; - dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + - dampi6/(numtyp)1890.0)*expi - - termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + - (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + - dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk; } } @@ -404,9 +404,9 @@ ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) if (diff < eps) { dampi4 = dampi2 * dampi2; dampi5 = dampi2 * dampi3; - dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + 7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi; - dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; } else { dampk2 = dampk * dampk; @@ -417,12 +417,12 @@ ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) termk = alphai2 / (alphai2-alphak2); termi2 = termi * termi; termk2 = termk * termk; - dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; - dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - - (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - + (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk; } } diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 9e231663c0..9da06cf636 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -91,6 +91,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp action pair_gauss_gpu.h pair_gauss.h action pair_gayberne_gpu.cpp pair_gayberne.cpp action pair_gayberne_gpu.h pair_gayberne.cpp +action pair_hippo_gpu.cpp pair_hippo.cpp +action pair_hippo_gpu.h pair_hippo.cpp action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp action pair_lj96_cut_gpu.h pair_lj96_cut.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 91bc679447..e1fe1f1097 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -76,7 +76,7 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -86,7 +86,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -170,7 +170,7 @@ void PairAmoebaGPU::init_style() maxspecial=atom->maxspecial; maxspecial15=atom->maxspecial15; } - + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, @@ -207,7 +207,7 @@ void PairAmoebaGPU::multipole_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -239,7 +239,7 @@ void PairAmoebaGPU::multipole_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -281,7 +281,7 @@ void PairAmoebaGPU::induce() // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - + if (use_ewald) { choose(POLAR_LONG); int nmine = p_kspace->nfft_owned; @@ -317,7 +317,7 @@ void PairAmoebaGPU::induce() memory->create(usump,nlocal,3,"ameoba/induce:usump"); // get the electrostatic field due to permanent multipoles - + dfield0c(field,fieldp); // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only @@ -345,7 +345,7 @@ void PairAmoebaGPU::induce() for (i = 0; i < 10; i++) { printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); + udirp[i][0], udirp[i][1], udirp[i][2]); } */ // get induced dipoles via the OPT extrapolation method @@ -353,7 +353,7 @@ void PairAmoebaGPU::induce() // uopt,uoptp with a optorder+1 dimension, just optorder ?? // since no need to store optorder+1 values after these loops - if (poltyp == OPT) { + if (poltyp == OPT) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uopt[i][0][j] = udir[i][j]; @@ -460,7 +460,7 @@ void PairAmoebaGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); } - + //error->all(FLERR,"STOP GPU"); // set initial conjugate gradient residual and conjugate vector @@ -486,7 +486,7 @@ void PairAmoebaGPU::induce() cfstyle = RSD; comm->forward_comm_pair(this); uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -574,7 +574,7 @@ void PairAmoebaGPU::induce() if (pcgprec) { cfstyle = RSD; comm->forward_comm_pair(this); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -629,7 +629,7 @@ void PairAmoebaGPU::induce() if (iter >= politer) done = true; // apply a "peek" iteration to the mutual induced dipoles - + if (done) { for (i = 0; i < nlocal; i++) { term = pcgpeek * poli[i]; @@ -644,7 +644,7 @@ void PairAmoebaGPU::induce() // terminate the calculation if dipoles failed to converge // NOTE: could make this an error - + if (iter >= maxiter || eps > epsold) if (me == 0) error->warning(FLERR,"AMOEBA induced dipoles did not converge"); @@ -652,7 +652,7 @@ void PairAmoebaGPU::induce() // DEBUG output to dump file - if (uind_flag) + if (uind_flag) dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); // deallocation of arrays @@ -700,7 +700,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) PairAmoeba::udirect2b(field, fieldp); return; } - + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -753,7 +753,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -764,7 +764,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - + } /* ---------------------------------------------------------------------- @@ -802,7 +802,7 @@ void PairAmoebaGPU::udirect2b_cpu() firstneigh = list->firstneigh; // NOTE: doesn't this have a problem if aewald is tiny ?? - + aesq2 = 2.0 * aewald * aewald; aesq2n = 0.0; if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); @@ -829,13 +829,13 @@ void PairAmoebaGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; ddi = dirdamp[itype]; - + // evaluate all sites within the cutoff distance for (jj = 0; jj < jnum; jj++) { jextra = jlist[jj]; j = jextra & NEIGHMASK15; - + xr = x[j][0] - x[i][0]; yr = x[j][1] - x[i][1]; zr = x[j][2] - x[i][2]; @@ -844,7 +844,7 @@ void PairAmoebaGPU::udirect2b_cpu() jtype = amtype[j]; jgroup = amgroup[j]; - + factor_wscale = special_polar_wscale[sbmask15(jextra)]; if (igroup == jgroup) { factor_pscale = special_polar_piscale[sbmask15(jextra)]; @@ -872,7 +872,7 @@ void PairAmoebaGPU::udirect2b_cpu() aefac = aesq2 * aefac; bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; } - + // find terms needed later to compute mutual polarization if (poltyp != DIRECT) { @@ -891,7 +891,7 @@ void PairAmoebaGPU::udirect2b_cpu() scalek = factor_uscale; bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; - + neighptr[n++] = j; tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; tdipdip[ndip++] = bcn[1]*xr*yr; @@ -902,7 +902,7 @@ void PairAmoebaGPU::udirect2b_cpu() } else { if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } - + } // jj firstneigh_dipole[i] = neighptr; @@ -973,7 +973,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -1001,7 +1001,7 @@ void PairAmoebaGPU::polar_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1033,7 +1033,7 @@ void PairAmoebaGPU::polar_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1091,11 +1091,11 @@ void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; - vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); - vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); virial_comp[0] += vxx; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index f4cbf28561..fbc1b6b238 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -88,7 +88,7 @@ int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -185,7 +185,7 @@ void PairHippoGPU::init_style() maxspecial=atom->maxspecial; maxspecial15=atom->maxspecial15; } - + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, @@ -222,7 +222,7 @@ void PairHippoGPU::dispersion_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -250,7 +250,7 @@ void PairHippoGPU::dispersion_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, off2, atom->q, domain->boxlo, domain->prd); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); } @@ -270,7 +270,7 @@ void PairHippoGPU::multipole_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -302,7 +302,7 @@ void PairHippoGPU::multipole_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -344,7 +344,7 @@ void PairHippoGPU::induce() // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - + if (use_ewald) { choose(POLAR_LONG); int nmine = p_kspace->nfft_owned; @@ -380,7 +380,7 @@ void PairHippoGPU::induce() memory->create(usump,nlocal,3,"ameoba/induce:usump"); // get the electrostatic field due to permanent multipoles - + dfield0c(field,fieldp); // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only @@ -408,7 +408,7 @@ void PairHippoGPU::induce() for (i = 0; i < 10; i++) { printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); + udirp[i][0], udirp[i][1], udirp[i][2]); } */ // get induced dipoles via the OPT extrapolation method @@ -416,7 +416,7 @@ void PairHippoGPU::induce() // uopt,uoptp with a optorder+1 dimension, just optorder ?? // since no need to store optorder+1 values after these loops - if (poltyp == OPT) { + if (poltyp == OPT) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uopt[i][0][j] = udir[i][j]; @@ -523,7 +523,7 @@ void PairHippoGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); } - + //error->all(FLERR,"STOP GPU"); // set initial conjugate gradient residual and conjugate vector @@ -549,7 +549,7 @@ void PairHippoGPU::induce() cfstyle = RSD; comm->forward_comm_pair(this); uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -637,7 +637,7 @@ void PairHippoGPU::induce() if (pcgprec) { cfstyle = RSD; comm->forward_comm_pair(this); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -692,7 +692,7 @@ void PairHippoGPU::induce() if (iter >= politer) done = true; // apply a "peek" iteration to the mutual induced dipoles - + if (done) { for (i = 0; i < nlocal; i++) { term = pcgpeek * poli[i]; @@ -707,7 +707,7 @@ void PairHippoGPU::induce() // terminate the calculation if dipoles failed to converge // NOTE: could make this an error - + if (iter >= maxiter || eps > epsold) if (me == 0) error->warning(FLERR,"hippo induced dipoles did not converge"); @@ -715,7 +715,7 @@ void PairHippoGPU::induce() // DEBUG output to dump file - if (uind_flag) + if (uind_flag) dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); // deallocation of arrays @@ -763,7 +763,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) PairAmoeba::udirect2b(field, fieldp); return; } - + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -816,7 +816,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -827,7 +827,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - + } /* ---------------------------------------------------------------------- @@ -865,7 +865,7 @@ void PairHippoGPU::udirect2b_cpu() firstneigh = list->firstneigh; // NOTE: doesn't this have a problem if aewald is tiny ?? - + aesq2 = 2.0 * aewald * aewald; aesq2n = 0.0; if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); @@ -892,13 +892,13 @@ void PairHippoGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; ddi = dirdamp[itype]; - + // evaluate all sites within the cutoff distance for (jj = 0; jj < jnum; jj++) { jextra = jlist[jj]; j = jextra & NEIGHMASK15; - + xr = x[j][0] - x[i][0]; yr = x[j][1] - x[i][1]; zr = x[j][2] - x[i][2]; @@ -907,7 +907,7 @@ void PairHippoGPU::udirect2b_cpu() jtype = amtype[j]; jgroup = amgroup[j]; - + factor_wscale = special_polar_wscale[sbmask15(jextra)]; if (igroup == jgroup) { factor_pscale = special_polar_piscale[sbmask15(jextra)]; @@ -935,7 +935,7 @@ void PairHippoGPU::udirect2b_cpu() aefac = aesq2 * aefac; bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; } - + // find terms needed later to compute mutual polarization if (poltyp != DIRECT) { @@ -954,7 +954,7 @@ void PairHippoGPU::udirect2b_cpu() scalek = factor_uscale; bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; - + neighptr[n++] = j; tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; tdipdip[ndip++] = bcn[1]*xr*yr; @@ -965,7 +965,7 @@ void PairHippoGPU::udirect2b_cpu() } else { if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } - + } // jj firstneigh_dipole[i] = neighptr; @@ -1036,7 +1036,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -1064,7 +1064,7 @@ void PairHippoGPU::polar_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1096,7 +1096,7 @@ void PairHippoGPU::polar_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1156,11 +1156,11 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; - vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); - vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); virial_comp[0] += vxx;