From 3825fee8e9f25684da2baaebf3575f141c4fd580 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Aug 2021 22:57:37 -0500 Subject: [PATCH 001/181] Added work on amoeba/gpu, some minor changes to PairAmoeba to allow function overriding in PairAmoebaGPU, added the package AMOEBA to cmake/CMakeLists.txt --- cmake/CMakeLists.txt | 2 + lib/gpu/lal_amoeba.cpp | 155 ++++++++ lib/gpu/lal_amoeba.cu | 684 ++++++++++++++++++++++++++++++++++++ lib/gpu/lal_amoeba.h | 87 +++++ lib/gpu/lal_amoeba_ext.cpp | 142 ++++++++ lib/gpu/lal_base_amoeba.cpp | 516 +++++++++++++++++++++++++++ lib/gpu/lal_base_amoeba.h | 225 ++++++++++++ lib/gpu/lal_base_atomic.cpp | 4 +- lib/gpu/lal_base_charge.cpp | 4 +- lib/gpu/lal_base_dipole.cpp | 4 +- lib/gpu/lal_base_dpd.cpp | 5 +- lib/gpu/lal_base_three.cpp | 4 +- lib/gpu/lal_neighbor.cpp | 17 + lib/gpu/lal_neighbor.h | 4 + lib/gpu/lal_neighbor_gpu.cu | 15 + src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/Install.sh | 2 + src/GPU/pair_amoeba_gpu.cpp | 299 ++++++++++++++++ src/GPU/pair_amoeba_gpu.h | 63 ++++ 19 files changed, 2228 insertions(+), 6 deletions(-) create mode 100644 lib/gpu/lal_amoeba.cpp create mode 100644 lib/gpu/lal_amoeba.cu create mode 100644 lib/gpu/lal_amoeba.h create mode 100644 lib/gpu/lal_amoeba_ext.cpp create mode 100644 lib/gpu/lal_base_amoeba.cpp create mode 100644 lib/gpu/lal_base_amoeba.h create mode 100644 src/GPU/pair_amoeba_gpu.cpp create mode 100644 src/GPU/pair_amoeba_gpu.h diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 06297ca919..ccc9902778 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -140,6 +140,7 @@ option(CMAKE_VERBOSE_MAKEFILE "Generate verbose Makefiles" OFF) set(STANDARD_PACKAGES ADIOS + AMOEBA ASPHERE ATC AWPMD @@ -308,6 +309,7 @@ endif() pkg_depends(ML-IAP ML-SNAP) pkg_depends(MPIIO MPI) pkg_depends(ATC MANYBODY) +pkg_depends(AMOEBA KSPACE) pkg_depends(LATBOLTZ MPI) pkg_depends(PHONON KSPACE) pkg_depends(SCAFACOS MPI) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp new file mode 100644 index 0000000000..67f0877e1a --- /dev/null +++ b/lib/gpu/lal_amoeba.cpp @@ -0,0 +1,155 @@ +/*************************************************************************** + amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the amoeba pair style. 
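  Computes the real-space induced-dipole (polar) interactions of the AMOEBA
  force field on the device via the k_amoeba_polar kernel.  Per-type Thole
  damping parameters (pdamp, thole) and the special polar scale factors are
  packed into the damping and sp_polar arrays; the per-atom tep accumulators
  filled by the kernel are copied back to the host by the base class.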
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "amoeba_cl.h" +#elif defined(USE_CUDART) +const char *amoeba=0; +#else +#include "amoeba_cubin.h" +#endif + +#include "lal_amoeba.h" +#include +namespace LAMMPS_AL { +#define AmoebaT Amoeba + +extern Device device; + +template +AmoebaT::Amoeba() : BaseAmoeba(), + _allocated(false) { +} + +template +AmoebaT::~Amoeba() { + clear(); +} + +template +int AmoebaT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, + const double *host_thole, const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double aewald, const double felec, + const double off2, const double polar_dscale, + const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = (numtyp)0; + host_write[i].w = (numtyp)0; + } + + damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(damping,host_write,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_polar_wscale[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_polar,dview,5,false); + + _aewald = aewald; + _felec = felec; + _off2 = off2; + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=damping.row_bytes() + + sp_polar.row_bytes() + + this->_tep.row_bytes(); + return 0; +} + +template +void AmoebaT::clear() { + if (!_allocated) + return; + _allocated=false; + + damping.clear(); + sp_polar.clear(); + + this->clear_atomic(); +} + +template +double AmoebaT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Amoeba); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int AmoebaT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int 
GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int _nall=this->atom->nall(); + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + this->k_polar.set_size(GX,BX); + + this->k_polar.run(&this->atom->x, &this->atom->extra, + &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, + &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + return GX; +} + +template class Amoeba; +} diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu new file mode 100644 index 0000000000..fbda1e0787 --- /dev/null +++ b/lib/gpu/lal_amoeba.cu @@ -0,0 +1,684 @@ +// ************************************************************************** +// amoeba.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the amoeba pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include +#include "lal_aux_fun1.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset, \ + i, tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && 
iioff2) continue; + + numtyp r = ucl_sqrt(r2); + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + numtyp uir = uix*xr + uiy*yr + uiz*zr; + numtyp uirp = uixp*xr + uiyp*yr + uizp*zr; + numtyp ukr = ukx*xr + uky*yr + ukz*zr; + numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr; + + // get reciprocal distance terms for this interaction + + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; + numtyp alsq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply Thole polarization damping to scale factors + + numtyp sc3 = (numtyp)1.0; + numtyp sc5 = (numtyp)1.0; + numtyp sc7 = (numtyp)1.0; + for (k = 0; k < 3; k++) { + rc3[k] = (numtyp)0.0; + rc5[k] = (numtyp)0.0; + rc7[k] = (numtyp)0.0; + } + + // apply Thole polarization damping to scale factors + + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + sc3 = (numtyp)1.0 - expdamp; + sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp; + sc7 
= (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp; + numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv; + numtyp temp5 = damp; + numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp; + rc3[0] = xr * temp3; + rc3[1] = yr * temp3; + rc3[2] = zr * temp3; + rc5[0] = rc3[0] * temp5; + rc5[1] = rc3[1] * temp5; + rc5[2] = rc3[2] * temp5; + rc7[0] = rc5[0] * temp7; + rc7[1] = rc5[1] * temp7; + rc7[2] = rc5[2] * temp7; + } + + psc3 = (numtyp)1.0 - sc3*factor_pscale; + psc5 = (numtyp)1.0 - sc5*factor_pscale; + psc7 = (numtyp)1.0 - sc7*factor_pscale; + dsc3 = (numtyp)1.0 - sc3*factor_dscale; + dsc5 = (numtyp)1.0 - sc5*factor_dscale; + dsc7 = (numtyp)1.0 - sc7*factor_dscale; + usc3 = (numtyp)1.0 - sc3*factor_uscale; + usc5 = (numtyp)1.0 - sc5*factor_uscale; + psr3 = bn[1] - psc3*rr3; + psr5 = bn[2] - psc5*rr5; + psr7 = bn[3] - psc7*rr7; + dsr3 = bn[1] - dsc3*rr3; + dsr5 = bn[2] - dsc5*rr5; + dsr7 = bn[3] - dsc7*rr7; + usr5 = bn[2] - usc5*rr5; + for (k = 0; k < 3; k++) { + prc3[k] = rc3[k] * factor_pscale; + prc5[k] = rc5[k] * factor_pscale; + prc7[k] = rc7[k] * factor_pscale; + drc3[k] = rc3[k] * factor_dscale; + drc5[k] = rc5[k] * factor_dscale; + drc7[k] = rc7[k] * factor_dscale; + urc3[k] = rc3[k] * factor_uscale; + urc5[k] = rc5[k] * factor_uscale; + } + } else { // damp == 0: ??? + } + + // get the induced dipole field used for dipole torques + + numtyp tix3 = psr3*ukx + dsr3*ukxp; + numtyp tiy3 = psr3*uky + dsr3*ukyp; + numtyp tiz3 = psr3*ukz + dsr3*ukzp; + numtyp tuir = -psr5*ukr - dsr5*ukrp; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp); + numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); + numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); + tuir = -psr7*ukr - dsr7*ukrp; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the dEd/dR terms used for direct polarization force + + term1 = bn[2] - dsc3*rr5; + term2 = bn[3] - dsc5*rr7; + term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr - dsr5*xr; + term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; + term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; + numtyp tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6; + numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr - dsr5*yr; + term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1]; + term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr; + numtyp tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2]; + term4 = rr3*drc3[2] - term1*zr - dsr5*zr; + term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2]; + term6 = 
(bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2]; + term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr; + numtyp tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*xr*yr - rr3*yr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0]; + term7 = rr5*drc5[0] - term2*xr; + numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6; + + term3 = term1*xr*zr - rr3*zr*drc3[0]; + term5 = term2*xr*zr - rr5*zr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0]; + numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*yr*zr - rr3*zr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1]; + term7 = rr5*drc5[1] - term2*yr; + numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp; + numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp; + numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp; + + numtyp frcx = depx; + numtyp frcy = depy; + numtyp frcz = depz; + + // get the dEp/dR terms used for direct polarization force + + // tixx and tkxx + term1 = bn[2] - psc3*rr5; + term2 = bn[3] - psc5*rr7; + term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr - psr5*xr; + term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0]; + term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr; + tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + // tiyy and tkyy + term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr - psr5*yr; + term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1]; + term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr; + tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + // tizz and tkzz + term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2]; + term4 = rr3*prc3[2] - term1*zr - psr5*zr; + term5 = 
term2*zr*zr - psr5 - rr5*zr*prc5[2]; + term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2]; + term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr; + tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tixy and tkxy + term3 = term1*xr*yr - rr3*yr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0]; + term7 = rr5*prc5[0] - term2*xr; + tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6; + + // tixz and tkxz + term3 = term1*xr*zr - rr3*zr*prc3[0]; + term5 = term2*xr*zr - rr5*zr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0]; + tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tiyz and tkyz + term3 = term1*yr*zr - rr3*zr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1]; + term7 = rr5*prc5[1] - term2*yr; + tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + // get the dtau/dr terms used for mutual polarization force + // poltyp == MUTUAL && amoeba + + term1 = bn[2] - usc3*rr5; + term2 = bn[3] - usc5*rr7; + term3 = usr5 + term1; + term4 = rr3 * factor_uscale; + term5 = -xr*term3 + rc3[0]*term4; + term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0]; + tixx = uix*term5 + uir*term6; + tkxx = ukx*term5 + ukr*term6; + + term5 = -yr*term3 + rc3[1]*term4; + term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1]; + tiyy = uiy*term5 + uir*term6; + tkyy = uky*term5 + ukr*term6; + + term5 = -zr*term3 + rc3[2]*term4; + term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2]; + tizz = uiz*term5 + uir*term6; + tkzz = ukz*term5 + ukr*term6; + + term4 = -usr5 * yr; + term5 = -xr*term1 + rr3*urc3[0]; + term6 = xr*yr*term2 - rr5*yr*urc5[0]; + tixy = uix*term4 + uiy*term5 + uir*term6; + tkxy = ukx*term4 + uky*term5 + ukr*term6; + + term4 = -usr5 * zr; + term6 = xr*zr*term2 - rr5*zr*urc5[0]; + tixz = uix*term4 + uiz*term5 + uir*term6; + tkxz = ukx*term4 + ukz*term5 + ukr*term6; + + term5 = -yr*term1 + rr3*urc3[1]; + term6 = yr*zr*term2 - rr5*zr*urc5[1]; + tiyz = uiy*term4 + uiz*term5 + uir*term6; + tkyz = uky*term4 + ukz*term5 + ukr*term6; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + 
tizz*ukzp + + tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; k +class Amoeba : public BaseAmoeba { + public: + Amoeba(); + ~Amoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const double *host_pdamp, + const double *host_thole, const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double aewald, const double felec, + const double off2, const double polar_dscale, + const double polar_uscale); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = damping.x; thole = damping.y + UCL_D_Vec damping; + /// Special polar values [0-4]: + /// sp_polar.x = special_polar_wscale + /// sp_polar.y special_polar_pscale, + /// sp_polar.z = special_polar_piscale + UCL_D_Vec sp_polar; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp new file mode 100644 index 0000000000..27c35a810f --- /dev/null +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -0,0 +1,142 @@ +/*************************************************************************** + amoeba_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to amoeba acceleration routines. 
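  The C-style wrappers below (amoeba_gpu_init, amoeba_gpu_clear,
  amoeba_gpu_compute_n, amoeba_gpu_compute, amoeba_gpu_bytes) forward to a
  single static Amoeba instance (AMOEBAMF) and are intended to be called
  from the pair style in src/GPU/pair_amoeba_gpu.cpp.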
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_amoeba.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Amoeba AMOEBAMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int amoeba_gpu_init(const int ntypes, const int max_amtype, + const double *host_pdamp, const double *host_thole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double aewald, const double felec, + const double off2, const double polar_dscale, + const double polar_uscale, int& tep_size) { + AMOEBAMF.clear(); + gpu_mode=AMOEBAMF.device->gpu_mode(); + double gpu_split=AMOEBAMF.device->particle_split(); + int first_gpu=AMOEBAMF.device->first_device(); + int last_gpu=AMOEBAMF.device->last_device(); + int world_me=AMOEBAMF.device->world_me(); + int gpu_rank=AMOEBAMF.device->gpu_rank(); + int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); + + tep_size=sizeof(PRECISION); + + AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); + + bool message=false; + if (AMOEBAMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, + host_special_polar_wscale, host_special_polar_piscale, + host_special_polar_pscale, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, screen, + aewald, felec, off2, polar_dscale, polar_uscale); + + AMOEBAMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + AMOEBAMF.estimate_gpu_overhead(); + return init_ok; +} + +void amoeba_gpu_clear() { + AMOEBAMF.clear(); +} + +int** amoeba_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void **tep_ptr) { + return AMOEBAMF.compute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd, tep_ptr); +} + +void amoeba_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, 
int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd, void **tep_ptr) { + AMOEBAMF.compute(ago,inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + ilist, numj, firstneigh, eflag, vflag, eatom, vatom, + host_start, cpu_time, success, host_q, nlocal, boxlo, prd, tep_ptr); +} + +double amoeba_gpu_bytes() { + return AMOEBAMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp new file mode 100644 index 0000000000..c5f4a01222 --- /dev/null +++ b/lib/gpu/lal_base_amoeba.cpp @@ -0,0 +1,516 @@ +/*************************************************************************** + base_amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include "lal_base_amoeba.h" +namespace LAMMPS_AL { +#define BaseAmoebaT BaseAmoeba + +extern Device global_device; + +template +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif +} + +template +BaseAmoebaT::~BaseAmoeba() { + delete ans; + delete nbor; + k_polar.clear(); + k_special15.clear(); + if (pair_program) delete pair_program; +} + +template +int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseAmoebaT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const int maxspecial15, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_charge(); + + bool charge = true; + bool rot = false; + bool vel = false; + _extra_fields = 24; // round up to accomodate quadruples of numtyp values + // rpole 13; uind 3; uinp 3; amtype, amgroup + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); + compile_kernels(*ucl_device,pair_program,k_name); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = 
device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + _maxspecial=maxspecial; + _maxspecial15=maxspecial15; + + // allocate per-atom array tep + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + + return success; +} + +template +void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); +} + +template +void BaseAmoebaT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); + + _tep.clear(); + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + // add one-five neighbors + + if (_maxspecial15>0) { + UCL_H_Vec view_nspecial15; + UCL_H_Vec view_special15; + view_nspecial15.view(nspecial15,nall,*ucl_device); + view_special15.view(special15[0],nall*_maxspecial15,*ucl_device); + ucl_copy(dev_nspecial15,view_nspecial15,nall,false); + ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false); + 
nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall); + + add_onefive_neighbors(); + } + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd, void **tep_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // ------------------- Resize _tep array ------------------------ + + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + + *tep_ptr=_tep.host.begin(); + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + // packing host arrays into host_extra + + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_q_data(); + atom->add_extra_data(); + + device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, + boxlo, prd); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void 
**tep_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // ------------------- Resize _tep array ------------------------ + + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + *tep_ptr=_tep.host.begin(); + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_q_data(); + atom->add_extra_data(); + + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, + boxlo, prd); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return nbor->host_jlist.begin()-host_start; +} + +template +double BaseAmoebaT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseAmoeba); +} + +template +void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp) { + int _nall=atom->nall(); + numtyp *pextra=reinterpret_cast(&(atom->extra[0])); + + int n = 0; + int nstride = 4; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][0]; + pextra[idx+1] = rpole[i][1]; + pextra[idx+2] = rpole[i][2]; + pextra[idx+3] = rpole[i][3]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][4]; + pextra[idx+1] = rpole[i][5]; + pextra[idx+2] = rpole[i][6]; + pextra[idx+3] = rpole[i][8]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][9]; + pextra[idx+1] = rpole[i][12]; + pextra[idx+2] = (numtyp)amtype[i]; + pextra[idx+3] = (numtyp)amgroup[i]; + } + + n += nstride*_nall; + for (int i = 
0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = uind[i][0]; + pextra[idx+1] = uind[i][1]; + pextra[idx+2] = uind[i][2]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = uinp[i][0]; + pextra[idx+1] = uinp[i][1]; + pextra[idx+2] = uinp[i][2]; + } +} + +template +void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname) { + if (_compiled) + return; + + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); + + k_polar.set_function(*pair_program,kname); + k_special15.set_function(*pair_program,"k_special15"); + pos_tex.get_texture(*pair_program,"pos_tex"); + q_tex.get_texture(*pair_program,"q_tex"); + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + size_t mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + +} + +template +int BaseAmoebaT::add_onefive_neighbors() { + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(ans->inum())/ + (BX/_threads_per_atom))); + + int _nall=atom->nall(); + int ainum=ans->inum(); + int nbor_pitch=nbor->nbor_pitch(); + + k_special15.set_size(GX,BX); + k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), + &atom->dev_tag, &dev_nspecial15, &dev_special15, + &ainum, &_nall, &nbor_pitch, + &_threads_per_atom); + + return GX; +} + +template class BaseAmoeba; +} diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h new file mode 100644 index 0000000000..ac9c23e8a9 --- /dev/null +++ b/lib/gpu/lal_base_amoeba.h @@ -0,0 +1,225 @@ +/*************************************************************************** + base_amoeba.h + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. 
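  In addition, per-atom AMOEBA data are packed by cast_extra_data() into the
  atom->extra array as five nall-long blocks of numtyp4 values:
    block 1: rpole[0..3]                            (charge and dipole)
    block 2: rpole[4], rpole[5], rpole[6], rpole[8] (quadrupole components)
    block 3: rpole[9], rpole[12], amtype, amgroup
    block 4: uind[0..2]   (fourth component unused)
    block 5: uinp[0..2]   (fourth component unused)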
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#ifndef LAL_BASE_AMOEBA_H +#define LAL_BASE_AMOEBA_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#if defined(USE_OPENCL) +#include "geryon/ocl_texture.h" +#elif defined(USE_CUDART) +#include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseAmoeba { + public: + BaseAmoeba(); + virtual ~BaseAmoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *screen, + const void *pair_program, const char *k_name); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(const int add_kernels=0); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + 
atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint **special15, + bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd, void **tep_ptr); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **tep_ptr); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom *atom; + + UCL_Vector polar1, polar2, polar3, polar4, polar5; + + /// cast host arrays into a single array for atom->extra + void cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp); + + /// Per-atom arrays + UCL_Vector _tep; + int _max_tep_size; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + /// Device storage for 1-5 special neighbor counts + UCL_D_Vec dev_nspecial15; + /// Device storage for special neighbors + UCL_D_Vec dev_special15, dev_special15_t; + + int add_onefive_neighbors(); + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_polar,k_special15; + inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) {} + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled; + int _block_size, _block_bio_size, _threads_per_atom; + int _extra_fields; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; + + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + + virtual int loop(const int 
eflag, const int vflag) = 0; +}; + +} + +#endif diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 6aad138aa1..bda9441c5b 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 9045420425..5c236873d0 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,false,nlocal,nall,maxspecial); + bool charge = true; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 439637cbde..71650ebf7e 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,true,nlocal,nall,maxspecial); + bool charge = true; + bool rot = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index d3c3353415..07c11caf8f 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); + bool charge = false; + bool rot = false; + bool vel = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel); if (success!=0) return success; diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index 15ef20230d..9dfee9b8c9 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall, else _threads_per_atom=device->threads_per_three(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index a0d2eaa8c3..4e65a58003 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -579,6 +579,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); if (_time_device) time_nbor.add_to_total(); + + // on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial) + // on the device, transpose the matrix (1-d array) for coalesced reads + // dev_special[i][j] = the special i neighbor of atom j + time_transpose.start(); const int b2x=_block_cell_2d; const int b2y=_block_cell_2d; @@ -682,6 +687,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (_cutoff < _cell_size) vadjust*=1.46; mn=std::max(mn,static_cast(ceil(_max_neighbor_factor*vadjust*mn))); if (mn<33) mn+=3; + resize_max_neighbors(mn,success); set_nbor_block_size(mn/2); 
if (!success) @@ -834,6 +840,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); } +void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in) +{ + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; + const int g2x=static_cast(ceil(static_cast(columns_in)/b2x)); + const int g2y=static_cast(ceil(static_cast(rows_in)/b2y)); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&out, &in, &columns_in, &rows_in); +} + template void Neighbor::build_nbor_list (double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index c1e1a87ef4..97aec4e280 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -260,6 +260,10 @@ class Neighbor { return o.str(); } + /// Helper function + void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in); + private: NeighborShared *_shared; UCL_Device *dev; diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 6fd724b494..144e9fa284 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -44,6 +44,19 @@ _texture_2d( pos_tex,int4); #define LAL_USE_OLD_NEIGHBOR #endif +/* + compute the id of the cell where the atoms belong to +x: atom coordinates +cell_id: cell ids +particle_id: +boxlo[0-2]: the lower left corner of the local box +ncell[xyz]: the number of cells in xyz dims +i_cell_size is the inverse cell size +inum = the number of the local atoms that are ported to the device +nall = the number of the local+ghost atoms that are ported to the device +cells_in_cutoff = the number of cells that are within the cutoff +*/ + __kernel void calc_cell_id(const numtyp4 *restrict x_, unsigned *restrict cell_id, int *restrict particle_id, @@ -86,6 +99,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_, } } +// compute the number of atoms in each cell + __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, int *restrict cell_counts, int nall, int ncell) { diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index b28a00fb84..4644d4a137 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -357,7 +357,7 @@ class PairAmoeba : public Pair { void polar(); void polar_energy(); - void polar_real(); + virtual void polar_real(); void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index a87d2165d9..9e231663c0 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -41,6 +41,8 @@ action fix_npt_gpu.cpp action fix_nve_asphere_gpu.h fix_nve_asphere.h action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp action gpu_extra.h +action pair_amoeba_gpu.cpp pair_amoeba.cpp +action pair_amoeba_gpu.h pair_amoeba.h action pair_beck_gpu.cpp pair_beck.cpp action pair_beck_gpu.h pair_beck.h action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp new file mode 100644 index 0000000000..4f1b20d364 --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -0,0 +1,299 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern) +------------------------------------------------------------------------- */ + +#include "pair_amoeba_gpu.h" + +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int amoeba_gpu_init(const int ntypes, const int max_amtype, + const double *host_pdamp, const double *host_thole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double aewald, const double felec, + const double off2, const double polar_dscale, + const double polar_uscale, int& tep_size); +void amoeba_gpu_clear(); + +int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **tep_ptr); +void amoeba_gpu_compute(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd, void **tep_ptr); + +double amoeba_gpu_bytes(); + +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; + +/* ---------------------------------------------------------------------- */ + +PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairAmoebaGPU::~PairAmoebaGPU() +{ + amoeba_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_real() +{ + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + 
sublo[1] = domain->sublo[1];
+      sublo[2] = domain->sublo[2];
+      subhi[0] = domain->subhi[0];
+      subhi[1] = domain->subhi[1];
+      subhi[2] = domain->subhi[2];
+    } else {
+      domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+    }
+    inum = atom->nlocal;
+
+    firstneigh = amoeba_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+                                      atom->type, amtype, amgroup,
+                                      rpole, uind, uinp, sublo, subhi,
+                                      atom->tag, atom->nspecial, atom->special,
+                                      atom->nspecial15, atom->special15,
+                                      eflag, vflag, eflag_atom, vflag_atom,
+                                      host_start, &ilist, &numneigh, cpu_time,
+                                      success, atom->q, domain->boxlo,
+                                      domain->prd, &tep_pinned);
+
+  } else {
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+
+    amoeba_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+                       amtype, amgroup, rpole, uind, uinp,
+                       ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+                       vflag_atom, host_start, cpu_time, success, atom->q,
+                       atom->nlocal, domain->boxlo, domain->prd, &tep_pinned);
+  }
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  // reference to the tep array from GPU lib
+
+  if (tep_single) {
+    float *tep_ptr = (float *)tep_pinned;
+    compute_force_from_tep(tep_ptr);
+  } else {
+    double *tep_ptr = (double *)tep_pinned;
+    compute_force_from_tep(tep_ptr);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute atom forces and virial from the torque (tep) array returned by the GPU library
+------------------------------------------------------------------------- */
+
+template <class numtyp>
+void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
+{
+  int i,ix,iy,iz;
+  double ci,dix,diy,diz;
+  double qixx,qixy,qixz;
+  double qiyy,qiyz,qizz;
+  double xix,yix,zix;
+  double xiy,yiy,ziy;
+  double xiz,yiz,ziz;
+  double vxx,vyy,vzz;
+  double vxy,vxz,vyz;
+  double fix[3],fiy[3],fiz[3],tep[4];
+
+  double** x = atom->x;
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    dix = rpole[i][1];
+    diy = rpole[i][2];
+    diz = rpole[i][3];
+    qixx = rpole[i][4];
+    qixy = rpole[i][5];
+    qixz = rpole[i][6];
+    qiyy = rpole[i][8];
+    qiyz = rpole[i][9];
+    qizz = rpole[i][12];
+
+    tep[0] = tep_ptr[4*i];
+    tep[1] = tep_ptr[4*i+1];
+    tep[2] = tep_ptr[4*i+2];
+    torque2force(i,tep,fix,fiy,fiz,fpolar);
+
+    iz = zaxis2local[i];
+    ix = xaxis2local[i];
+    iy = yaxis2local[i];
+
+    xiz = x[iz][0] - x[i][0];
+    yiz = x[iz][1] - x[i][1];
+    ziz = x[iz][2] - x[i][2];
+    xix = x[ix][0] - x[i][0];
+    yix = x[ix][1] - x[i][1];
+    zix = x[ix][2] - x[i][2];
+    xiy = x[iy][0] - x[i][0];
+    yiy = x[iy][1] - x[i][1];
+    ziy = x[iy][2] - x[i][2];
+
+    vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
+    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
+    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
+    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
+                 xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
+    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
+                 xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
+    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
+                 yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
+
+    virpolar[0] += vxx;
+    virpolar[1] += vyy;
+    virpolar[2] += vzz;
+    virpolar[3] += vxy;
+    virpolar[4] += vxz;
+    virpolar[5] += vyz;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairAmoebaGPU::init_style()
+{
+  PairAmoeba::init_style();
+
+  if (gpu_mode == GPU_FORCE) {
+    if (comm->me == 0)
+      error->warning(FLERR,"Pair style amoeba/gpu does not support neigh no "
+                     "for now, 
automatically switching to neigh yes"); + gpu_mode = GPU_NEIGH; + } + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + // select the cutoff (off2) for neighbor list builds (the polar term for now) + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + double cell_size = sqrt(off2) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int tep_size; + int mnf = 5e-2 * neighbor->oneatom; + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + maxspecial15, cell_size, gpu_mode, screen, + aewald, felec, off2, polar_dscale, polar_uscale, + tep_size); + GPU_EXTRA::check_flag(success,error,world); + + if (tep_size == sizeof(double)) + tep_single = false; + else + tep_single = true; +} + +/* ---------------------------------------------------------------------- */ + +double PairAmoebaGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + amoeba_gpu_bytes(); +} diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h new file mode 100644 index 0000000000..4d29bfaf34 --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.h @@ -0,0 +1,63 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(amoeba/gpu,PairAmoebaGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_AMOEBA_GPU_H +#define LMP_PAIR_AMOEBA_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairAmoebaGPU : public PairAmoeba { + public: + PairAmoebaGPU(LAMMPS *lmp); + ~PairAmoebaGPU(); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + virtual void polar_real(); + + private: + int gpu_mode; + double cpu_time; + void *tep_pinned; + bool tep_single; + + template + void compute_force_from_tep(const numtyp*); +}; + +} // namespace LAMMPS_NS +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Pair style amoeba/gpu requires atom attribute q + +The atom style defined does not have this attribute. 
+ +*/ From db92844228b555938a85ceb2d6f893010e5c5954 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Aug 2021 23:22:23 -0500 Subject: [PATCH 002/181] Added recent changes to FixGPU to enable newton_pair on --- src/GPU/fix_gpu.cpp | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 71ab3f4cb4..66b938c577 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -127,7 +127,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; int nthreads = 0; - int newtonflag = 0; + int newtonflag = force->newton_pair; int threads_per_atom = -1; double binsize = 0.0; char *opencl_args = nullptr; @@ -211,14 +211,16 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : #endif // set newton pair flag - // require newtonflag = 0 since currently required by all GPU pair styles - - if (newtonflag == 1) error->all(FLERR,"Illegal package gpu command"); force->newton_pair = newtonflag; if (force->newton_pair || force->newton_bond) force->newton = 1; else force->newton = 0; + // require newton pair off if _particle_split < 1 + + if (force->newton_pair == 1 && _particle_split < 1) + error->all(FLERR,"Cannot use newton pair on for split less than 1"); + if (pair_only_flag) { lmp->suffixp = lmp->suffix; lmp->suffix = nullptr; @@ -341,7 +343,23 @@ void FixGPU::post_force(int /* vflag */) force->pair->virial[4] += lvirial[4]; force->pair->virial[5] += lvirial[5]; - if (force->pair->vflag_fdotr) force->pair->virial_fdotr_compute(); + // for newton pair off: force->pair->vflag_fdotr = 0 + // which has been the case so far, virial_fdotr_compute() is never called + // for newton pair on: force->pair->vflag_fdotr = 1 + // for neigh yes: full neighbor lists are built on the device + // for neigh no: full neighbor lists are built on the host + // either way the virial is tallied to force->pair->virial as above + // so as long as _particle_split == 1 + // no need to call force->pair->virial_fdotr_compute(); + // If _particle_split < 1, the local atom forces computed by + // the gpu pair styles on the host (cpu_compute()) got tallied + // by comm->reverse_comm() (which is done before this post_force() function). + // A call to force->pair->virial_fdotr_compute() would double count + // the virial from the local atoms on the host. + // Here a possible workaround is to comment out the below command + // while enforcing newton pair off for _particle_split < 1. 
+ + //if (force->pair->vflag_fdotr) force->pair->virial_fdotr_compute(); timer->stamp(Timer::PAIR); } From 91317b2879e72589a4b62868f05fceb98b48f3b7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 26 Aug 2021 09:33:20 -0500 Subject: [PATCH 003/181] Added changes to Atom and Device classes for allocation of extra fields and SBBITS15 and NEIGHMASK15 --- lib/gpu/lal_atom.cpp | 36 ++++++++++++++++++++++++++--- lib/gpu/lal_atom.h | 46 ++++++++++++++++++++++++++++++++++---- lib/gpu/lal_device.cpp | 11 ++++++--- lib/gpu/lal_device.h | 3 ++- lib/gpu/lal_preprocessor.h | 4 ++++ 5 files changed, 89 insertions(+), 11 deletions(-) diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index cda4d383b5..618ffb0106 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const { bytes+=sizeof(numtyp); if (_vel) bytes+=4*sizeof(numtyp); + if (_extra_fields>0) + bytes+=_extra_fields*sizeof(numtyp); return bytes; } @@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } + if (_extra_fields>0 && _host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } if (_gpu_nbor>0) { if (_bonds) { @@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) { template bool AtomT::add_fields(const bool charge, const bool rot, - const int gpu_nbor, const bool bonds, const bool vel) { + const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { bool success=true; // Ignore host/device transfers? int gpu_bytes=0; @@ -191,6 +199,16 @@ bool AtomT::add_fields(const bool charge, const bool rot, } } + if (extra_fields > 0 && _extra_fields==0) { + _extra_fields=extra_fields; + _other=true; + if (_host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } + } + if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { @@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot, template bool AtomT::init(const int nall, const bool charge, const bool rot, - UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) { + UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { clear(); bool success=true; @@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, _q_avail=false; _quat_avail=false; _v_avail=false; + _extra_avail=false; _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; _rot=rot; _vel=vel; - _other=_charge || _rot || _vel; + _extra_fields=extra_fields; + _other=_charge || _rot || _vel || (extra_fields>0); dev=&devi; _time_transfer=0; @@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_q.init(*dev); time_quat.init(*dev); time_vel.init(*dev); + time_extra.init(*dev); + time_pos.zero(); time_q.zero(); time_quat.zero(); time_vel.zero(); + time_extra.zero(); + _time_cast=0.0; #ifdef GPU_CAST @@ -308,6 +333,8 @@ void AtomT::clear_resize() { quat.clear(); if (_vel) v.clear(); + if (_extra_fields>0) + extra.clear(); dev_cell_id.clear(); dev_particle_id.clear(); @@ -350,6 +377,7 @@ void AtomT::clear() { time_q.clear(); time_quat.clear(); time_vel.clear(); + time_extra.clear(); clear_resize(); #ifdef GPU_CAST @@ -370,6 +398,8 @@ double AtomT::host_memory_usage() const { atom_bytes+=4; if 
(_vel) atom_bytes+=4; + if (_extra_fields>0) + atom_bytes+=_extra_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 3cf97d94a0..ff335fffa9 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -76,7 +76,7 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ @@ -96,7 +96,7 @@ class Atom { * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, - const bool bonds, const bool vel=false); + const bool bonds, const bool vel=false, const int extra_fields=0); /// Returns true if GPU is using charges bool charge() { return _charge; } @@ -107,6 +107,9 @@ class Atom { /// Returns true if GPU is using velocities bool velocity() { return _vel; } + /// Returns true if GPU is using extra fields + bool using_extra() { return _extra_fields; } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -450,6 +453,38 @@ class Atom { add_v_data(host_ptr,host_tag); } + // Cast extras to write buffer + template + inline void cast_extra_data(cpytyp *host_ptr) { + if (_extra_avail==false) { + double t=MPI_Wtime(); + if (_host_view) { + extra.host.view((numtyp*)host_ptr,_nall*_extra_fields,*dev); + extra.device.view(extra.host); + } else if (sizeof(numtyp)==sizeof(double)) + memcpy(extra.host.begin(),host_ptr,_nall*_extra_fields*sizeof(numtyp)); + else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall*_extra_fields; i++) extra[i]=host_ptr[i]; + _time_cast+=MPI_Wtime()-t; + } + } + + // Copy extras to device + /** Copies nall()*_extra elements **/ + inline void add_extra_data() { + time_extra.start(); + if (_extra_avail==false) { + extra.update_device(_nall*_extra_fields,true); + _extra_avail=true; + } + time_extra.stop(); + } + /// Add in casting time from additional data (seconds) inline void add_cast_time(double t) { _time_cast+=t; } @@ -473,6 +508,8 @@ class Atom { UCL_Vector quat; /// Velocities UCL_Vector v; + /// Extras + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; @@ -493,7 +530,7 @@ class Atom { UCL_H_Vec host_particle_id; /// Device timers - UCL_Timer time_pos, time_q, time_quat, time_vel; + UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra; /// Geryon device UCL_Device *dev; @@ -508,11 +545,12 @@ class Atom { bool _compiled; // True if data has been copied to device already - bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; + bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized; bool alloc(const int nall); bool _allocated, _rot, _charge, _bonds, _vel, _other; + int _extra_fields; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index e2b5b9cdb5..8908f3aff7 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -424,7 +424,7 @@ template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int 
nall, const int maxspecial, - const bool vel) { + const bool vel, const int extra_fields) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) @@ -453,7 +453,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields)) return -3; _data_in_estimate++; @@ -463,6 +463,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (vel) _data_in_estimate++; + if (extra_fields>0) + _data_in_estimate++; + } else { if (atom.charge()==false && charge) _data_in_estimate++; @@ -470,7 +473,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (atom.velocity()==false && vel) _data_in_estimate++; - if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) + if (atom.using_extra()==false && extra_fields>0) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields)) return -3; } diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 1db6ae3127..01d3d64627 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -61,6 +61,7 @@ class Device { * \param nall Total number of local+ghost particles * \param maxspecial Maximum mumber of special bonded atoms per atom * \param vel True if velocities need to be stored + * \param extra_fields Nonzero if extra fields need to be stored * * Returns: * - 0 if successful @@ -70,7 +71,7 @@ class Device { * - -5 Double precision is not supported on card **/ int init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Initialize the device for Atom storage only /** \param nlocal Total number of local particles to allocate memory for diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 12cf6345c2..2ef8af0911 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -330,6 +330,10 @@ #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; +#define SBBITS15 29 +#define NEIGHMASK15 0x1FFFFFFF +ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; }; + // default to 32-bit smallint and other ints, 64-bit bigint: // same as defined in src/lmptype.h #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ From 88f3dd334c6d8d99fa1b35492f2a945637dfdce7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 26 Aug 2021 09:35:43 -0500 Subject: [PATCH 004/181] Some changes in PPPMGPU due to the API changes in the GridComm class --- src/GPU/pppm_gpu.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 8e3ec2ace8..476e54f8ca 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -106,8 +106,6 @@ PPPMGPU::PPPMGPU(LAMMPS *lmp) : PPPM(lmp) PPPMGPU::~PPPMGPU() { PPPM_GPU_API(clear)(poisson_time); - destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out); - destroy_3d_offset(vd_brick,nzlo_out,nylo_out); } /* ---------------------------------------------------------------------- @@ -257,12 +255,12 @@ void PPPMGPU::compute(int eflag, int vflag) // remap from 3d decomposition to FFT decomposition if (triclinic == 0) { - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_GPU, - 
gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_GPU, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_gpu(); } else { - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); PPPM::brick2fft(); } @@ -276,21 +274,21 @@ void PPPMGPU::compute(int eflag, int vflag) // to fill ghost cells surrounding their 3d bricks if (differentiation_flag == 1) - gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else - gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } poisson_time += MPI_Wtime()-t3; @@ -833,8 +831,8 @@ void PPPMGPU::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_A_brick; density_fft = density_A_fft; - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // group B @@ -842,8 +840,8 @@ void PPPMGPU::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_B_brick; density_fft = density_B_fft; - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // switch back pointers From 6a998fcb8e0bbea758ac5738f98ed11891db4f5c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 26 Aug 2021 11:17:49 -0500 Subject: [PATCH 005/181] Added fix store/state commands to the example input scripts --- examples/amoeba/in.ubiquitin | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index e6f9893e41..acb8b7fcb6 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -23,6 +23,15 @@ pair_coeff * * amoeba_ubiquitin.prm amoeba_ubiquitin.key special_bonds lj/coul 0.5 0.5 0.5 one/five yes +# setup force components this way so can dump them (AMOEBA or HIPPO also needs them for now) + +#fix fhal all store/state 0 fx fy fz +#fix frepulse all store/state 0 fx fy fz +#fix fdisp all store/state 0 fx fy fz +#fix fpolar all store/state 0 fx fy fz +#fix fmpole all store/state 0 fx fy fz +#fix fqxfer all store/state 0 fx fy fz + # thermo output compute virial all pressure NULL virial From 42048ee73fa6ae32fdfda7a47f6db3020691fd74 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 26 Aug 2021 11:23:21 -0500 Subject: [PATCH 006/181] Activated the fix store/state commands in one of the example input scripts --- examples/amoeba/in.ubiquitin | 12 ++++++------ 1 file changed, 6 
insertions(+), 6 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index acb8b7fcb6..7f0b653350 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -25,12 +25,12 @@ special_bonds lj/coul 0.5 0.5 0.5 one/five yes # setup force components this way so can dump them (AMOEBA or HIPPO also needs them for now) -#fix fhal all store/state 0 fx fy fz -#fix frepulse all store/state 0 fx fy fz -#fix fdisp all store/state 0 fx fy fz -#fix fpolar all store/state 0 fx fy fz -#fix fmpole all store/state 0 fx fy fz -#fix fqxfer all store/state 0 fx fy fz +fix fhal all store/state 0 fx fy fz +fix frepulse all store/state 0 fx fy fz +fix fdisp all store/state 0 fx fy fz +fix fpolar all store/state 0 fx fy fz +fix fmpole all store/state 0 fx fy fz +fix fqxfer all store/state 0 fx fy fz # thermo output From 5ffae6ed23171e4bd9f366c0511575904b4558bd Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 30 Aug 2021 09:14:46 -0500 Subject: [PATCH 007/181] Limited to neigh yes for amoeba/gpu for now --- src/GPU/pair_amoeba_gpu.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 4f1b20d364..09ba100e4e 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -230,13 +230,6 @@ void PairAmoebaGPU::init_style() { PairAmoeba::init_style(); - if (gpu_mode == GPU_FORCE) { - if (comm->me == 0) - error->warning(FLERR,"Pair style amoeba/gpu does not support neigh no " - "for now, automatically switching to neigh yes"); - gpu_mode = GPU_NEIGH; - } - // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -284,6 +277,9 @@ void PairAmoebaGPU::init_style() tep_size); GPU_EXTRA::check_flag(success,error,world); + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); + if (tep_size == sizeof(double)) tep_single = false; else From 07b60827c459e7ea57b7e351d1e064e27090c9ef Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 1 Sep 2021 12:30:41 -0500 Subject: [PATCH 008/181] Working on the udirect2b kernel for the induce real space term, need to add the API for the GPU library --- lib/gpu/lal_amoeba.cpp | 5 +- lib/gpu/lal_amoeba.cu | 258 +++++++++++++++++++++++++++++++++++++ lib/gpu/lal_amoeba.h | 3 +- lib/gpu/lal_amoeba_ext.cpp | 5 +- 4 files changed, 266 insertions(+), 5 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 67f0877e1a..a3bd653efd 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -45,7 +45,8 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const { template int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_special_polar_wscale, + const double *host_thole, const double *host_dirdamp, + const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const int nlocal, const int nall, const int max_nbors, @@ -76,7 +77,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda for (int i = 0; i < max_amtype; i++) { host_write[i].x = host_pdamp[i]; host_write[i].y = host_thole[i]; - host_write[i].z = (numtyp)0; + host_write[i].z = host_dirdamp[i]; host_write[i].w = (numtyp)0; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fbda1e0787..1f5fb42438 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -91,6 +91,37 @@ 
_texture( q_tex,int2); tep[i]=t; \ } +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ + i, field, fieldp) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + 
numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + + } // nbor + + } // ii { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_special_polar_wscale, + const double *host_thole, const double *host_dirdamp, + const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const int nlocal, const int nall, const int max_nbors, diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 27c35a810f..a7959ed93e 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -29,6 +29,7 @@ static Amoeba AMOEBAMF; // --------------------------------------------------------------------------- int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -62,7 +63,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, int init_ok=0; if (world_me==0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, + init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, @@ -82,7 +83,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, fflush(screen); } if 
(gpu_rank==i && world_me!=0)
-    init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole,
+    init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp,
                           host_special_polar_wscale, host_special_polar_piscale,
                           host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial,
                           maxspecial15, cell_size, gpu_split, screen,

From 785a794d3933c56d495d2970518ab653c8d1ba6c Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Wed, 1 Sep 2021 14:37:11 -0500
Subject: [PATCH 009/181] Added and renamed API to make room for additional
 kernels (udirect2b only computes the field and fieldp, not accumulating
 forces, energies, nor virials)

---
 lib/gpu/lal_amoeba.cpp      |  35 ++++-
 lib/gpu/lal_amoeba.cu       |  20 +--
 lib/gpu/lal_amoeba.h        |   5 +-
 lib/gpu/lal_amoeba_ext.cpp  |  31 ++--
 lib/gpu/lal_base_amoeba.cpp | 155 ++++++++++++++----
 lib/gpu/lal_base_amoeba.h   |  24 ++-
 src/AMOEBA/pair_amoeba.h    |   2 +-
 src/GPU/pair_amoeba_gpu.cpp | 283 ++++++++++++++++++++++++++++++------
 src/GPU/pair_amoeba_gpu.h   |   4 +
 9 files changed, 448 insertions(+), 111 deletions(-)

diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index a3bd653efd..c7b4872db0 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -125,10 +125,10 @@ double AmoebaT::host_memory_usage() const {
 }
 
 // ---------------------------------------------------------------------------
-// Calculate energies, forces, and torques
+// Calculate the polar real-space term, returning tep
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int AmoebaT::loop(const int eflag, const int vflag) {
+int AmoebaT::polar_real(const int eflag, const int vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
@@ -140,9 +140,7 @@ int AmoebaT::loop(const int eflag, const int vflag) {
 
   this->time_pair.start();
   this->k_polar.set_size(GX,BX);
-
-  this->k_polar.run(&this->atom->x, &this->atom->extra,
-                    &damping, &sp_polar,
+  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->ans->force, &this->ans->engv, &this->_tep,
                     &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
@@ -152,5 +150,32 @@ int AmoebaT::loop(const int eflag, const int vflag) {
   return GX;
 }
 
+// ---------------------------------------------------------------------------
+// Compute the real-space permanent field (udirect2b), returning field/fieldp
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int AmoebaT::udirect2b(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int _nall=this->atom->nall();
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+/*
+  this->k_polar.set_size(GX,BX);
+  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                    &this->ans->force, &this->ans->engv, &this->_tep,
+                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                    &this->_threads_per_atom,
+                    &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
+*/
+  this->time_pair.stop();
+  return GX;
+}
+
 template class Amoeba<PRECISION,ACC_PRECISION>;
 }
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 1f5fb42438..3d28939d42 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -715,11 +715,6 @@ 
__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, //numtyp4 xi__; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + numtyp aesq2 = (numtyp)2.0 * aewald*aewald; + numtyp aesq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); for ( ; nbor { numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale; numtyp _qqrd2e; - private: + protected: bool _allocated; - int loop(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index a7959ed93e..9fa3c7f75b 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -105,7 +105,7 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } -int** amoeba_gpu_compute_n(const int ago, const int inum_full, +int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -116,7 +116,7 @@ int** amoeba_gpu_compute_n(const int ago, const int inum_full, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return AMOEBAMF.compute(ago, inum_full, nall, host_x, host_type, + return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, @@ -124,18 +124,21 @@ int** amoeba_gpu_compute_n(const int ago, const int inum_full, host_q, boxlo, prd, tep_ptr); } -void amoeba_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd, void **tep_ptr) { - AMOEBAMF.compute(ago,inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, - ilist, numj, firstneigh, eflag, vflag, eatom, vatom, - host_start, cpu_time, success, host_q, nlocal, boxlo, prd, tep_ptr); +int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void **fieldp_ptr) { + return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, host_q, boxlo, prd, fieldp_ptr); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c5f4a01222..0c9a422cec 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -118,8 +118,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, if (ef_nall==0) 
ef_nall=2000; - _max_tep_size=static_cast(static_cast(ef_nall)*1.10); - _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_alloc_size=static_cast(static_cast(ef_nall)*1.10); + _fieldp.alloc(_max_alloc_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _tep.alloc(_max_alloc_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); @@ -149,6 +150,7 @@ void BaseAmoebaT::clear_atomic() { ans->clear(); _tep.clear(); + _fieldp.clear(); dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -250,9 +252,9 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, // ------------------- Resize _tep array ------------------------ - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); + if (nall>_max_alloc_size) { + _max_alloc_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_alloc_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -296,17 +298,17 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); - const int red_blocks=loop(eflag,vflag); + const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies +// Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall, +int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -336,9 +338,9 @@ int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall, // ------------------- Resize _tep array ------------------------ - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); + if (nall>_max_alloc_size) { + _max_alloc_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_alloc_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -388,16 +390,16 @@ int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); - const int red_blocks=loop(eflag,vflag); + const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); // copy tep from device to host - _tep.update_host(_max_tep_size*4,false); + _tep.update_host(_max_alloc_size*4,false); /* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_alloc_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); @@ -406,6 +408,101 @@ int** BaseAmoebaT::compute(const 
int ago, const int inum_full, const int nall, return nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the permanent field +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void** fieldp_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // ------------------- Resize _fieldp array ------------------------ + + if (nall>_max_alloc_size) { + _max_alloc_size=static_cast(static_cast(nall)*1.10); + _fieldp.resize(_max_alloc_size*8); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + *fieldp_ptr=_fieldp.host.begin(); + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, nullptr, nullptr); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, nullptr, nullptr); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_q_data(); + atom->add_extra_data(); + + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + const int red_blocks=udirect2b(eflag,vflag); + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + hd_balancer.stop_timer(); + + // copy field and fieldp from device to host + + //_fieldp.update_host(_max_field_size*8,false); + + return nbor->host_jlist.begin()-host_start; +} + template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ @@ -446,20 +543,24 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+3] = (numtyp)amgroup[i]; } - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = uind[i][0]; - pextra[idx+1] = uind[i][1]; - pextra[idx+2] = uind[i][2]; + if (uind) { + n += 
nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = uind[i][0]; + pextra[idx+1] = uind[i][1]; + pextra[idx+2] = uind[i][2]; + } } - - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = uinp[i][0]; - pextra[idx+1] = uinp[i][1]; - pextra[idx+2] = uinp[i][2]; + + if (uinp) { + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = uinp[i][0]; + pextra[idx+1] = uinp[i][1]; + pextra[idx+2] = uinp[i][2]; + } } } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index ac9c23e8a9..7ef94c776e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -128,7 +128,7 @@ class BaseAmoeba { tagint **special, int *nspecial15, tagint **special15, bool &success); - /// Pair loop with host neighboring + /// Compute polar real-space with host neighboring (not active for now) void compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, @@ -138,8 +138,8 @@ class BaseAmoeba { const double cpu_time, bool &success, double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); - /// Pair loop with device neighboring - int** compute(const int ago, const int inum_full, const int nall, + /// Compute polar real-space with device neighboring + int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, @@ -150,6 +150,17 @@ class BaseAmoeba { int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd, void **tep_ptr); + /// Compute the direct real space part of the permanent field (udirect2b) with device neighboring + int** compute_udirect2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage @@ -179,8 +190,8 @@ class BaseAmoeba { double** uind, double** uinp); /// Per-atom arrays - UCL_Vector _tep; - int _max_tep_size; + UCL_Vector _tep,_fieldp; + int _max_alloc_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -217,7 +228,8 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual int loop(const int eflag, const int vflag) = 0; + virtual int polar_real(const int eflag, const int vflag) = 0; + virtual int udirect2b(const int eflag, const int vflag) = 0; }; } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 4644d4a137..9d23fccdd8 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -369,7 +369,7 @@ class PairAmoeba : public Pair { void umutual1(double **, double **); void umutual2b(double **, double **); void udirect1(double **); - void udirect2b(double **, double **); + virtual void udirect2b(double **, double **); void dampmut(double, 
double, double, double *); void dampdir(double, double, double, double *, double *); void cholesky(int, double *, double *); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 09ba100e4e..3f4e72c0af 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -24,19 +24,24 @@ #include "error.h" #include "force.h" #include "gpu_extra.h" +#include "math_const.h" +#include "my_page.h" #include "neigh_list.h" #include "neigh_request.h" #include "neighbor.h" #include "suffix.h" - #include using namespace LAMMPS_NS; +using namespace MathConst; + +enum{MUTUAL,OPT,TCG,DIRECT}; // External functions from cuda library for atom decomposition int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -48,7 +53,17 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const double polar_uscale, int& tep_size); void amoeba_gpu_clear(); -int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall, +int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **fieldp_ptr); + +int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -58,15 +73,6 @@ int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd, void **tep_ptr); -void amoeba_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd, void **tep_ptr); double amoeba_gpu_bytes(); @@ -80,6 +86,8 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) reinitflag = 0; cpu_time = 0.0; suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tep_pinned = nullptr; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -102,42 +110,31 @@ void PairAmoebaGPU::polar_real() bool success = true; int *ilist, *numneigh, **firstneigh; - if (gpu_mode != GPU_FORCE) { - double sublo[3],subhi[3]; - if (domain->triclinic == 0) { - sublo[0] = domain->sublo[0]; - sublo[1] = domain->sublo[1]; - sublo[2] = domain->sublo[2]; - subhi[0] = domain->subhi[0]; - subhi[1] = domain->subhi[1]; - subhi[2] = domain->subhi[2]; - } else { - domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); - } - inum = atom->nlocal; - - firstneigh = amoeba_gpu_compute_n(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, 
uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, - domain->prd, &tep_pinned); - + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; } else { - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - - amoeba_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, - amtype, amgroup, rpole, uind, uinp, - ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, atom->q, - atom->nlocal, domain->boxlo, domain->prd, &tep_pinned); + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } + inum = atom->nlocal; + + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd, &tep_pinned); + + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -248,6 +245,7 @@ void PairAmoebaGPU::init_style() } // select the cutoff (off2) for neighbor list builds (the polar term for now) + // NOTE: induce and polar terms are using the same flags here if (use_ewald) choose(POLAR_LONG); else choose(POLAR); @@ -268,7 +266,7 @@ void PairAmoebaGPU::init_style() double felec = 0.5 * electric / am_dielectric; - int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, @@ -286,6 +284,199 @@ void PairAmoebaGPU::init_style() tep_single = true; } +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b(double **field, double **fieldp) +{ + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, sublo, + subhi, atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // rebuild dipole-dipole pair list and store pairwise 
dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + udirect2b_cpu(); +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b_cpu() +{ + int i,j,k,m,n,ii,jj,kk,kkk,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti,ddi; + double pgamma; + double damp,expdamp; + double scale3,scale5; + double scale7,scalek; + double bn[4],bcn[3]; + double factor_dscale,factor_pscale,factor_uscale,factor_wscale; + + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + // launching the kernel to compute field and fieldp + + // amoeba_gpu_compute_field(...); + + double **x = atom->x; + + // neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? + + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + ddi = dirdamp[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + factor_wscale = special_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = special_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = special_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = 1.0; + } + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + 
bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + /* ---------------------------------------------------------------------- */ double PairAmoebaGPU::memory_usage() diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 4d29bfaf34..e5d4aab176 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -34,13 +34,17 @@ class PairAmoebaGPU : public PairAmoeba { enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; virtual void polar_real(); + virtual void udirect2b(double **, double **); private: int gpu_mode; double cpu_time; void *tep_pinned; + void *fieldp_pinned; bool tep_single; + void udirect2b_cpu(); + template void compute_force_from_tep(const numtyp*); }; From 7e0c77f1cbb38d98d03423a3d3ff0efb8ccd7b41 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 1 Sep 2021 14:51:36 -0500 Subject: [PATCH 010/181] Added fallback flags to indicate which terms are ready from the GPU lib --- src/GPU/pair_amoeba_gpu.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3f4e72c0af..3cdaa25633 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -104,6 +104,12 @@ PairAmoebaGPU::~PairAmoebaGPU() void PairAmoebaGPU::polar_real() { + bool gpu_polar_real_ready = true; + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -292,6 +298,12 @@ void PairAmoebaGPU::init_style() void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { + bool gpu_udirect2b_ready = false; + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -354,10 +366,6 @@ void PairAmoebaGPU::udirect2b_cpu() int inum,jnum; int *ilist,*jlist,*numneigh,**firstneigh; - // launching the kernel to compute field and fieldp - - // amoeba_gpu_compute_field(...); - double **x = atom->x; // neigh list From 745c7089f0cf40162e6b790726f1766cd588379f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 3 Sep 2021 01:00:29 -0500 Subject: [PATCH 011/181] Temporarily commented out the section in the Atom class where FixGPU finds the optimal bin size. This section makes ev_tally4() in Angle different from CPU-only runs, even with a single command "package gpu 1" without any gpu pair style. Need more effort to understand why. 
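A plausible mechanism: FixGPU::binsize() changes the spatial-sort bin geometry, which reorders the owned atoms and therefore the floating-point summation order inside per-atom tallies such as ev_tally4(). Until the root cause is understood, CPU-only and GPU runs can be forced to bin identically by pinning the sort parameters in the input script; this is the atom_modify workaround toggled in the ubiquitin example over the next few patches:

    atom_modify sort 1000 7.0    # spatially sort atoms every 1000 steps using 7.0-Angstrom bins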
--- src/GPU/fix_gpu.cpp | 2 ++ src/atom.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 66b938c577..51f36defdc 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -386,6 +386,8 @@ double FixGPU::memory_usage() return bytes; } +/* ---------------------------------------------------------------------- */ + double FixGPU::binsize(const double subx, const double suby, const double subz, const int nlocal, const double cut) { diff --git a/src/atom.cpp b/src/atom.cpp index 86e2b1151b..4ad5110ec9 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2274,6 +2274,7 @@ void Atom::setup_sort_bins() #ifdef LMP_GPU if (userbinsize == 0.0) { int ifix = modify->find_fix("package_gpu"); +/* if (ifix >= 0) { const double subx = domain->subhi[0] - domain->sublo[0]; const double suby = domain->subhi[1] - domain->sublo[1]; @@ -2297,6 +2298,7 @@ void Atom::setup_sort_bins() bininvy = bininv; bininvz = bininv; } +*/ } #endif From 7d69a870a4f4bfcab8dce6b0d6460ac1cde41a5d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 3 Sep 2021 13:43:22 -0500 Subject: [PATCH 012/181] Reverted the binsize function call from the GPU package in Atom, instead added atom_modify sort with a binsize to ensure matching virial values, enabled the udirect2b kernel, need more work to override dfield0c, and induce() to bypass reverse_comm() for field and fieldp (line amoeba_induce.cpp:111-112) --- examples/amoeba/in.ubiquitin | 2 +- lib/gpu/lal_amoeba.cpp | 20 +++++------ lib/gpu/lal_amoeba.cu | 55 ++++++++++++++++++------------- lib/gpu/lal_base_amoeba.cpp | 64 +++++++++++++++++++++++------------- lib/gpu/lal_base_amoeba.h | 15 +++++---- src/GPU/pair_amoeba_gpu.cpp | 25 ++++++++++++-- src/atom.cpp | 2 -- 7 files changed, 115 insertions(+), 68 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index e02d849ba4..2491493c45 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -4,7 +4,7 @@ units real boundary p p p atom_style amoeba - +#atom_modify sort 1000 7.0 bond_style class2 angle_style amoeba dihedral_style none diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index c7b4872db0..0d78a8618a 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -57,7 +57,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, - cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar"); + cell_size,gpu_split,_screen,amoeba, + "k_amoeba_polar", "k_amoeba_udirect2b"); if (success!=0) return success; @@ -164,15 +165,14 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); -/* - this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, - &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); -*/ + + this->k_udirect2b.set_size(GX,BX); + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &_aewald, &_off2, + &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); return GX; } diff 
--git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 3d28939d42..adcff0e648 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -91,8 +91,8 @@ _texture( q_tex,int2); tep[i]=t; \ } -#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ - i, field, fieldp) \ +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i, \ + fieldp) \ if (t_per_atom>1) { \ red_acc[0][tid]=_fieldp[0]; \ red_acc[1][tid]=_fieldp[1]; \ @@ -118,8 +118,8 @@ _texture( q_tex,int2); numtyp4 f, fp; \ f.x = _fieldp[0]; f.y = _fieldp[0]; f.z = _fieldp[2]; \ fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5]; \ - field[i] = f; \ - fieldp[i] = fp; \ + fieldp[ii] = f; \ + fieldp[ii+inum] = fp; \ } #else @@ -152,8 +152,8 @@ _texture( q_tex,int2); tep[i]=t; \ } -#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ - i, field, fieldp) \ +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i, \ + fieldp) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ @@ -168,8 +168,8 @@ _texture( q_tex,int2); numtyp4 f, fp; \ f.x = _fieldp[0]; f.y = _fieldp[0]; f.z = _fieldp[2]; \ fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5]; \ - field[i] = f; \ - fieldp[i] = fp; \ + fieldp[ii] = f; \ + fieldp[ii+inum] = fp; \ } #endif @@ -177,6 +177,11 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict damping, @@ -468,7 +473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; numtyp tixx = ci*term3 + dix*term4 + dir*term5 + - (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6; + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; @@ -684,19 +689,23 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv); } +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - __global numtyp4 *restrict field, - __global numtyp4 *restrict fieldp, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp felec, - const numtyp off2, const numtyp polar_dscale, - const numtyp polar_uscale) + const __global numtyp *restrict extra, 
+ const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -771,7 +780,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; - numtyp rr1 = felec * rinv; + numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; @@ -888,7 +897,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, // accumulate field and fieldp - store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,field,fieldp); + store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp); } /* ---------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 0c9a422cec..a1cf516777 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; k_polar.clear(); + k_udirect2b.clear(); k_special15.clear(); if (pair_program) delete pair_program; } @@ -53,7 +54,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name_polar, + const char *k_name_udirect2b) { screen=_screen; int gpu_nbor=0; @@ -85,7 +87,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -118,9 +120,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, if (ef_nall==0) ef_nall=2000; - _max_alloc_size=static_cast(static_cast(ef_nall)*1.10); - _fieldp.alloc(_max_alloc_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _tep.alloc(_max_alloc_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _max_fieldp_size = _max_tep_size; + _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); @@ -224,7 +227,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. 
// --------------------------------------------------------------------------- template -void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, +void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -252,9 +255,9 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, // ------------------- Resize _tep array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_alloc_size*4); + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -302,6 +305,10 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); } // --------------------------------------------------------------------------- @@ -338,9 +345,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const // ------------------- Resize _tep array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_alloc_size*4); + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -397,9 +404,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const // copy tep from device to host - _tep.update_host(_max_alloc_size*4,false); + _tep.update_host(_max_tep_size*4,false); /* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_alloc_size); + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); @@ -442,9 +449,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // ------------------- Resize _fieldp array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _fieldp.resize(_max_alloc_size*8); + if (nall>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(nall)*1.10); + _fieldp.resize(_max_fieldp_size*8); dev_nspecial15.clear(); dev_special15.clear(); @@ -492,13 +499,18 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i *jnum=nbor->host_acc.begin(); const int red_blocks=udirect2b(eflag,vflag); - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); hd_balancer.stop_timer(); - // copy field and fieldp from device to host + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - //_fieldp.update_host(_max_field_size*8,false); + _fieldp.update_host(_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", this->_field.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ return nbor->host_jlist.begin()-host_start; } @@ -566,7 +578,8 @@ void 
BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { + const char *kname_polar, + const char *kname_udirect2b) { if (_compiled) return; @@ -575,7 +588,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - k_polar.set_function(*pair_program,kname); + k_polar.set_function(*pair_program,kname_polar); + k_udirect2b.set_function(*pair_program,kname_udirect2b); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); @@ -593,6 +607,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, } +// --------------------------------------------------------------------------- +// Specify 1-5 neighbors from the current neighbor list +// --------------------------------------------------------------------------- + template int BaseAmoebaT::add_onefive_neighbors() { // Compute the block size and grid size to keep all cores busy diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 7ef94c776e..ae0f33ef29 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -53,8 +53,8 @@ class BaseAmoeba { * - -5 Double precision is not supported on card **/ int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, - const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const double gpu_split, FILE *screen, const void *pair_program, + const char *kname_polar, const char *kname_udirect2b); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -129,7 +129,7 @@ class BaseAmoeba { bool &success); /// Compute polar real-space with host neighboring (not active for now) - void compute(const int f_ago, const int inum_full, const int nall, + void compute_polar_real(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, int *ilist, int *numj, @@ -190,8 +190,8 @@ class BaseAmoeba { double** uind, double** uinp); /// Per-atom arrays - UCL_Vector _tep,_fieldp; - int _max_alloc_size; + UCL_Vector _tep, _fieldp; + int _max_tep_size, _max_fieldp_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -210,7 +210,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_polar,k_special15; + UCL_Kernel k_polar, k_udirect2b, k_special15; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -226,7 +226,8 @@ class BaseAmoeba { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *kname_polar, const char *kname_udirect2b); virtual int polar_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3cdaa25633..a5cc86e39d 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ 
b/src/GPU/pair_amoeba_gpu.cpp @@ -298,7 +298,7 @@ void PairAmoebaGPU::init_style() void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = false; + bool gpu_udirect2b_ready = true; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; @@ -334,7 +334,28 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); - + + // get field and fieldp values from the GPU lib + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] = field_ptr[idx]; + field[i][1] = field_ptr[idx+1]; + field[i][2] = field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] = fieldp_ptr[idx]; + fieldp[i][1] = fieldp_ptr[idx+1]; + fieldp[i][2] = fieldp_ptr[idx+2]; + } + // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs diff --git a/src/atom.cpp b/src/atom.cpp index 4ad5110ec9..86e2b1151b 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2274,7 +2274,6 @@ void Atom::setup_sort_bins() #ifdef LMP_GPU if (userbinsize == 0.0) { int ifix = modify->find_fix("package_gpu"); -/* if (ifix >= 0) { const double subx = domain->subhi[0] - domain->sublo[0]; const double suby = domain->subhi[1] - domain->sublo[1]; @@ -2298,7 +2297,6 @@ void Atom::setup_sort_bins() bininvy = bininv; bininvz = bininv; } -*/ } #endif From 8f5f65e68da92c7649a7d0444b6630816db37ff5 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 3 Sep 2021 16:42:58 -0500 Subject: [PATCH 013/181] Declared virtual to relevant functions in PairAmoeba, added the overridden versions in PairAmoebaGPU --- examples/amoeba/in.ubiquitin | 2 +- src/AMOEBA/pair_amoeba.h | 4 +- src/GPU/pair_amoeba_gpu.cpp | 529 +++++++++++++++++++++++++++++++++-- src/GPU/pair_amoeba_gpu.h | 2 + 4 files changed, 514 insertions(+), 23 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index 2491493c45..f017d8f122 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -4,7 +4,7 @@ units real boundary p p p atom_style amoeba -#atom_modify sort 1000 7.0 +atom_modify sort 1000 7.0 bond_style class2 angle_style amoeba dihedral_style none diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 9d23fccdd8..0ec601de47 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -361,11 +361,11 @@ class PairAmoeba : public Pair { void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); - void induce(); + virtual void induce(); void ulspred(); void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); - void dfield0c(double **, double **); + virtual void dfield0c(double **, double **); void umutual1(double **, double **); void umutual2b(double **, double **); void udirect1(double **); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index a5cc86e39d..f2ba3acceb 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -18,13 +18,16 @@ #include "pair_amoeba_gpu.h" +#include "amoeba_convolution.h" #include "atom.h" #include "comm.h" #include "domain.h" #include "error.h" +#include "fix_store.h" #include "force.h" #include "gpu_extra.h" #include "math_const.h" 
+#include "memory.h" #include "my_page.h" #include "neigh_list.h" #include "neigh_request.h" @@ -35,7 +38,15 @@ using namespace LAMMPS_NS; using namespace MathConst; +enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye // External functions from cuda library for atom decomposition @@ -54,30 +65,28 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, void amoeba_gpu_clear(); int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **fieldp_ptr); + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **fieldp_ptr); int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **tep_ptr); + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **tep_ptr); double amoeba_gpu_bytes(); -enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; - /* ---------------------------------------------------------------------- */ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) @@ -290,6 +299,486 @@ void PairAmoebaGPU::init_style() tep_single = true; } +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::induce() +{ + bool done; + int i,j,m,ii,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; 
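  // The MUTUAL branch below is a standard preconditioned conjugate-gradient
  // solve of (alpha^-1 - T) u = E_perm, with poli playing the role of alpha,
  // ufield0c() supplying the T*u contribution, and uscale0b() applied as the
  // preconditioner when pcgprec is set.  Each iteration computes
  //   Ap = conj/poli - ufield(conj)
  //   a  = (rsd.zrsd)/(conj.Ap),   uind += a*conj,   rsd -= a*Ap
  //   b  = (rsd'.zrsd')/(rsd.zrsd),   conj = zrsd' + b*conj
  // with the same recurrence carried in parallel for the primed (*p) arrays.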
+ + double *poli; + double **conj,**conjp; + double **vec,**vecp; + double **udir,**usum,**usump; + + int debug = 1; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) { + choose(POLAR_LONG); + int nmine = p_kspace->nfft_owned; + memory->create(qfac,nmine,"ameoba/induce:qfac"); + } else choose(POLAR); + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // allocation of arrays + // NOTE: not all are used by all methods + // NOTE: could be re-allocated dynamically + + memory->create(poli,nlocal,"ameoba/induce:poli"); + memory->create(conj,nlocal,3,"ameoba/induce:conj"); + memory->create(conjp,nlocal,3,"ameoba/induce:conjp"); + memory->create(vec,nlocal,3,"ameoba/induce:vec"); + memory->create(vecp,nlocal,3,"ameoba/induce:vecp"); + memory->create(udir,nlocal,3,"ameoba/induce:udir"); + memory->create(usum,nlocal,3,"ameoba/induce:usum"); + memory->create(usump,nlocal,3,"ameoba/induce:usump"); + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // reverse comm to sum field,fieldp from ghost atoms to owned atoms + + crstyle = FIELD; + comm->reverse_comm_pair(this); + + // DEBUG statements + + /* + for (i = 0; i < nlocal; i++) + if (atom->tag[i] == 1) + printf("AAA FIELD atom %d: field %g %g %g: fieldp %g %g %g\n", + atom->tag[i], + field[i][0],field[i][1],field[i][2], + fieldp[i][0],fieldp[i][1],fieldp[i][2]); + */ + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } + + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
+ // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + crstyle = FIELD; + comm->reverse_comm_pair(this); + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + crstyle = FIELD; + comm->reverse_comm_pair(this); + + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm_pair(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm_pair(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + //error->all(FLERR,"STOP"); + + crstyle = FIELD; + comm->reverse_comm_pair(this); + + for 
(i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += conjp[i][j]*vecp[i][j]; + sum += rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm_pair(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm_pair(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + // NOTE: comp of b,bp and allreduce only needed if pcgprec ? + + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; j++) { + uind[i][j] += term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (me == 0) + error->warning(FLERR,"AMOEBA induced dipoles did not converge"); + } + + // DEBUG output to dump file + + if (uind_flag) + dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); + + // deallocation of arrays + + memory->destroy(poli); + memory->destroy(conj); + memory->destroy(conjp); + memory->destroy(vec); + memory->destroy(vecp); + memory->destroy(udir); + memory->destroy(usum); + memory->destroy(usump); + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] 
= upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + dfield0c = direct induction via Ewald sum + dfield0c computes the mutual electrostatic field due to + permanent multipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::dfield0c(double **field, double **fieldp) +{ + int i,j,ii; + double term; + + int inum; + int *ilist; + + // zero out field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + for (i = 0; i < nall; i++) { + for (j = 0; j < 3; j++) { + field[i][j] = 0.0; + fieldp[i][j] = 0.0; + } + } + + // get the reciprocal space part of the permanent field + + if (kspace_flag) udirect1(field); + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fieldp[i][j] = field[i][j]; + } + } + + // get the real space portion of the permanent field + + if (rspace_flag) udirect2b(field,fieldp); + + // get the self-energy portion of the permanent field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + field[i][j] += term*rpole[i][j+1]; + fieldp[i][j] += term*rpole[i][j+1]; + } + } +} + /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list udirect2b computes the real space contribution of the permanent @@ -298,7 +787,7 @@ void PairAmoebaGPU::init_style() void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = true; + bool gpu_udirect2b_ready = false; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e5d4aab176..9f538ca903 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -34,6 +34,8 @@ class PairAmoebaGPU : public PairAmoeba { enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; virtual void polar_real(); + virtual void induce(); + virtual void dfield0c(double **, double **); virtual void udirect2b(double **, double **); private: From be5aa46df82b1aaa78250f3228465c6f7260c17a Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 3 Sep 2021 17:32:41 -0500 Subject: [PATCH 014/181] Re-arranged the binsize call from the GPU lib in Atom so that the box bounds and bininv[xyz] are computed on the CPU side intact --- examples/amoeba/in.ubiquitin | 2 +- src/atom.cpp | 44 ++++++++++++------------------------ 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index f017d8f122..2491493c45 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -4,7 +4,7 @@ units real boundary p p p atom_style amoeba -atom_modify sort 1000 7.0 +#atom_modify sort 1000 7.0 bond_style class2 angle_style amoeba dihedral_style none diff --git a/src/atom.cpp b/src/atom.cpp index 86e2b1151b..71cb2e9f31 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2195,6 +2195,21 @@ void Atom::setup_sort_bins() return; } +#ifdef LMP_GPU + if (userbinsize == 0.0) { + int ifix = modify->find_fix("package_gpu"); + if (ifix >= 0) { + const double subx = domain->subhi[0] - domain->sublo[0]; + const double suby = domain->subhi[1] - domain->sublo[1]; + const double subz = domain->subhi[2] - domain->sublo[2]; + + FixGPU *fix = static_cast(modify->fix[ifix]); + binsize = fix->binsize(subx, suby, subz, atom->nlocal, + 
0.5 * neighbor->cutneighmax); + } + } +#endif + double bininv = 1.0/binsize; // nbin xyz = local bins @@ -2271,35 +2286,6 @@ void Atom::setup_sort_bins() } #endif -#ifdef LMP_GPU - if (userbinsize == 0.0) { - int ifix = modify->find_fix("package_gpu"); - if (ifix >= 0) { - const double subx = domain->subhi[0] - domain->sublo[0]; - const double suby = domain->subhi[1] - domain->sublo[1]; - const double subz = domain->subhi[2] - domain->sublo[2]; - - FixGPU *fix = static_cast(modify->fix[ifix]); - binsize = fix->binsize(subx, suby, subz, atom->nlocal, - neighbor->cutneighmax); - bininv = 1.0 / binsize; - - nbinx = static_cast (ceil(subx * bininv)); - nbiny = static_cast (ceil(suby * bininv)); - nbinz = static_cast (ceil(subz * bininv)); - if (domain->dimension == 2) nbinz = 1; - - if (nbinx == 0) nbinx = 1; - if (nbiny == 0) nbiny = 1; - if (nbinz == 0) nbinz = 1; - - bininvx = bininv; - bininvy = bininv; - bininvz = bininv; - } - } -#endif - if (1.0*nbinx*nbiny*nbinz > INT_MAX) error->one(FLERR,"Too many atom sorting bins"); From 4e346c2de637d32f33d6b59c798b22b12b1e56df Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 7 Sep 2021 13:05:57 -0500 Subject: [PATCH 015/181] Refactored neighbor list builds and per-atom reallocation parts --- lib/gpu/lal_amoeba_ext.cpp | 5 +- lib/gpu/lal_base_amoeba.cpp | 141 +++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.h | 20 ++++- src/GPU/pair_amoeba_gpu.cpp | 19 ++--- 4 files changed, 107 insertions(+), 78 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 9fa3c7f75b..59739f9f2a 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -127,6 +127,7 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, @@ -135,8 +136,8 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, bool &success, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, - tag, nspecial, special, nspecial15, special15, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd, fieldp_ptr); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a1cf516777..88caec3972 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -121,9 +121,12 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, ef_nall=2000; _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); @@ 
-312,10 +315,12 @@ void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute polar real-space +// Prepare for multiple kernel calls in a time step: +// - reallocate per-atom arrays, if needed +// - build the full neighbor lists for use by different kernels // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, +int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -324,9 +329,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int *nspecial15, tagint **special15, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, + int **&ilist, int **&jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { + double *prd) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -343,12 +348,10 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const set_kernel(eflag,vflag); - // ------------------- Resize _tep array ------------------------ - - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); + // ------------------- Resize 1-5 neighbor arrays ------------------------ + if (nall>_nmax) { + _nmax = nall; dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -356,7 +359,6 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } - *tep_ptr=_tep.host.begin(); if (inum_full==0) { host_start=0; @@ -397,6 +399,60 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); + return nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void **tep_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays and build the neighbor lists if needed + + int** firstneigh = nullptr; + 
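  // precompute(), factored out in this patch, grows the 1-5 special-neighbor
  // device arrays when nall exceeds _nmax, rebuilds the full GPU neighbor list
  // when ago == 0, and casts/copies the coordinates, charges and per-atom
  // multipole data; its return value is the host jlist pointer that the
  // compute_* entry points hand back to the pair style.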
firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); @@ -412,7 +468,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } */ - return nbor->host_jlist.begin()-host_start; + return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -423,6 +479,7 @@ template int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -447,59 +504,26 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i set_kernel(eflag,vflag); + // reallocate per-atom arrays and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + // ------------------- Resize _fieldp array ------------------------ if (nall>_max_fieldp_size) { _max_fieldp_size=static_cast(static_cast(nall)*1.10); _fieldp.resize(_max_fieldp_size*8); - - dev_nspecial15.clear(); - dev_special15.clear(); - dev_special15_t.clear(); - dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } *fieldp_ptr=_fieldp.host.begin(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - resize_atom(0,nall,success); - zero_timers(); - return nullptr; - } - - hd_balancer.balance(cpu_time); - int inum=hd_balancer.get_gpu_count(ago,inum_full); - ans->inum(inum); - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - success); - if (!success) - return nullptr; - atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, nullptr, nullptr); - hd_balancer.start_timer(); - } else { - atom->cast_x_data(host_x,host_type); - atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, nullptr, nullptr); - hd_balancer.start_timer(); - atom->add_x_data(host_x,host_type); - } - atom->add_q_data(); - atom->add_extra_data(); - - *ilist=nbor->host_ilist.begin(); - *jnum=nbor->host_acc.begin(); - const int red_blocks=udirect2b(eflag,vflag); - hd_balancer.stop_timer(); // copy field and fieldp from 
device to host (_fieldp store both arrays, one after another) @@ -510,9 +534,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); } -*/ - - return nbor->host_jlist.begin()-host_start; +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } template diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index ae0f33ef29..7d4f4c00b5 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -128,6 +128,18 @@ class BaseAmoeba { tagint **special, int *nspecial15, tagint **special15, bool &success); + /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed + int** precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + /// Compute polar real-space with host neighboring (not active for now) void compute_polar_real(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -153,7 +165,9 @@ class BaseAmoeba { /// Compute the direct real space part of the permanent field (udirect2b) with device neighboring int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, @@ -191,7 +205,7 @@ class BaseAmoeba { /// Per-atom arrays UCL_Vector _tep, _fieldp; - int _max_tep_size, _max_fieldp_size; + int _nmax, _max_tep_size, _max_fieldp_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -222,7 +236,7 @@ class BaseAmoeba { bool _compiled; int _block_size, _block_bio_size, _threads_per_atom; int _extra_fields; - double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index f2ba3acceb..d87e35cdf8 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -66,7 +66,8 @@ void amoeba_gpu_clear(); int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, @@ -370,17 +371,7 @@ void PairAmoebaGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); - // DEBUG statements - /* - for (i = 0; i < nlocal; i++) - if (atom->tag[i] == 1) - printf("AAA FIELD atom 
%d: field %g %g %g: fieldp %g %g %g\n", - atom->tag[i], - field[i][0],field[i][1],field[i][2], - fieldp[i][0],fieldp[i][1],fieldp[i][2]); - */ - // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { @@ -799,7 +790,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -814,8 +805,8 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) inum = atom->nlocal; firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, sublo, - subhi, atom->tag, atom->nspecial, atom->special, + atom->type, amtype, amgroup, rpole, uind, uinp, + sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, From 1c5d235f12799f6ce3b68dfbe5903fcd84840cc1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 7 Sep 2021 16:15:08 -0500 Subject: [PATCH 016/181] Working on the field and fieldp values from GPU back to the host for dfield0c --- lib/gpu/lal_amoeba.cu | 22 ++++++++++++++-------- lib/gpu/lal_base_amoeba.cpp | 7 ++++--- src/GPU/pair_amoeba_gpu.cpp | 32 +++++++++++++++++--------------- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index adcff0e648..c4f146a7c9 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -116,9 +116,13 @@ _texture( q_tex,int2); } \ if (offset==0 && ii1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ @@ -166,9 +170,13 @@ _texture( q_tex,int2); } \ if (offset==0 && ii_field.cols(), _max_fieldp_size); + + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index d87e35cdf8..6501376dfa 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -367,10 +367,10 @@ void PairAmoebaGPU::induce() dfield0c(field,fieldp); // reverse comm to sum field,fieldp from ghost atoms to owned atoms - +/* crstyle = FIELD; comm->reverse_comm_pair(this); - +*/ // set induced dipoles to polarizability times direct field @@ -778,7 +778,7 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp) void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = false; + bool gpu_udirect2b_ready = true; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; @@ -815,31 +815,33 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) if (!success) error->one(FLERR,"Insufficient memory on accelerator"); - // get field and fieldp values from the GPU lib + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; double *field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - field[i][0] = field_ptr[idx]; - field[i][1] = field_ptr[idx+1]; - field[i][2] = 
field_ptr[idx+2]; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; fieldp_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] = fieldp_ptr[idx]; - fieldp[i][1] = fieldp_ptr[idx+1]; - fieldp[i][2] = fieldp_ptr[idx+2]; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; } - - // rebuild dipole-dipole pair list and store pairwise dipole matrices - // done one atom at a time in real-space double loop over atoms & neighs - - udirect2b_cpu(); + } /* ---------------------------------------------------------------------- From 8c5a116d30d391f9dac1c03f6be225332b956ad1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 8 Sep 2021 16:43:33 -0500 Subject: [PATCH 017/181] Made dfield0c work to compute uind and uinp correctly; need to make sure they are correct for polar_real() --- lib/gpu/lal_base_amoeba.cpp | 4 +-- src/AMOEBA/amoeba_induce.cpp | 63 ++++++++++++++++++++++++++++++--- src/GPU/pair_amoeba_gpu.cpp | 67 ++++++++++++++++++++++++++++++------ 3 files changed, 117 insertions(+), 17 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 6800288093..26af83ab25 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -528,14 +528,14 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); - +/* printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", this->_fieldp.cols(), _max_fieldp_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); } - +*/ return firstneigh; //nbor->host_jlist.begin()-host_start; } diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index c8f361053c..b1e6fa3f5d 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -25,6 +25,7 @@ #include "my_page.h" #include "math_const.h" #include "memory.h" +#include "neighbor.h" #include "error.h" using namespace LAMMPS_NS; @@ -103,14 +104,21 @@ void PairAmoeba::induce() memory->create(usump,nlocal,3,"ameoba/induce:usump"); // get the electrostatic field due to permanent multipoles - + dfield0c(field,fieldp); // reverse comm to sum field,fieldp from ghost atoms to owned atoms crstyle = FIELD; comm->reverse_comm_pair(this); - +/* + printf("CPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < nlocal; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } +*/ // DEBUG statements /* @@ -135,7 +143,14 @@ void PairAmoeba::induce() } } } - +/* + printf("CPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 10; i++) { + printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", + i, udir[i][0], udir[i][1], udir[i][2], + udirp[i][0], udirp[i][1], udirp[i][2]); + } +*/ // DEBUG statements /* @@ -250,12 +265,30 @@ void PairAmoeba::induce() cfstyle = INDUCE; comm->forward_comm_pair(this); - +/* + if (comm->me == 0) { + printf("CPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; udirp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } +*/ ufield0c(field,fieldp); crstyle = FIELD; 
comm->reverse_comm_pair(this); - +/* + if (comm->me == 0) { + printf("CPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < nlocal; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } +*/ // DEBUG statements /* @@ -342,6 +375,16 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm_pair(this); +/* + if (comm->me == 0) { + printf("CPU: iter = %d\n", iter); + for (i = 0; i < 10; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } +*/ // DEBUG statements @@ -537,6 +580,7 @@ void PairAmoeba::induce() error->warning(FLERR,"AMOEBA induced dipoles did not converge"); } + // DEBUG output to dump file if (uind_flag) @@ -553,6 +597,15 @@ void PairAmoeba::induce() memory->destroy(usum); memory->destroy(usump); + if (comm->me == 0) { + printf("CPU: iter = %d\n", iter); + for (i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } + // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 6501376dfa..cd577af912 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -260,7 +260,7 @@ void PairAmoebaGPU::init_style() } } - // select the cutoff (off2) for neighbor list builds (the polar term for now) + // select the squared cutoff (off2) for neighbor list builds (the polar term for now) // NOTE: induce and polar terms are using the same flags here if (use_ewald) choose(POLAR_LONG); @@ -365,13 +365,10 @@ void PairAmoebaGPU::induce() // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); - - // reverse comm to sum field,fieldp from ghost atoms to owned atoms /* crstyle = FIELD; comm->reverse_comm_pair(this); */ - // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { @@ -385,7 +382,14 @@ void PairAmoebaGPU::induce() } } } - +/* + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 10; i++) { + printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", + i, udir[i][0], udir[i][1], udir[i][2], + udirp[i][0], udirp[i][1], udirp[i][2]); + } +*/ // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
@@ -489,13 +493,30 @@ void PairAmoebaGPU::induce() cfstyle = INDUCE; comm->forward_comm_pair(this); - +/* + if (comm->me == 0) { + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; udirp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } +*/ ufield0c(field,fieldp); crstyle = FIELD; comm->reverse_comm_pair(this); - - +/* + if (comm->me == 0) { + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < nlocal; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } +*/ // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { @@ -554,7 +575,16 @@ void PairAmoebaGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); - +/* + if (comm->me == 0) { + printf("GPU: iter = %d\n", iter); + for (i = 0; i < 10; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } +*/ for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = vec[i][j]; @@ -697,6 +727,15 @@ void PairAmoebaGPU::induce() memory->destroy(usum); memory->destroy(usump); + if (comm->me == 0) { + printf("GPU: iter = %d\n", iter); + for (i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } + // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -758,7 +797,7 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp) // get the real space portion of the permanent field if (rspace_flag) udirect2b(field,fieldp); - + // get the self-energy portion of the permanent field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; @@ -768,6 +807,14 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp) fieldp[i][j] += term*rpole[i][j+1]; } } +/* + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < nlocal; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } +*/ } /* ---------------------------------------------------------------------- From 6f6fd0999c324f679b263330ea9fad1aad725c10 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 9 Sep 2021 00:57:21 -0500 Subject: [PATCH 018/181] Both udirect2b and polar_real are working correctly on the GPU --- lib/gpu/lal_atom.h | 6 +- lib/gpu/lal_base_amoeba.cpp | 158 +++++++++++++++++++---------------- src/AMOEBA/amoeba_induce.cpp | 9 -- src/GPU/pair_amoeba_gpu.cpp | 34 ++++++-- src/GPU/pair_amoeba_gpu.h | 3 + 5 files changed, 121 insertions(+), 89 deletions(-) diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index ff335fffa9..842257a592 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -284,7 +284,11 @@ class Atom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() - { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } + { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; } + + /// Signal that we need to transfer atom extra data for next kernel call + inline void extra_data_unavail() + { _extra_avail=false; } typedef struct { double x,y,z; } 
vec3d; typedef struct { numtyp x,y,z,w; } vec4d_t; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 26af83ab25..9baa7b30d3 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -317,8 +317,10 @@ void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const // --------------------------------------------------------------------------- // Prepare for multiple kernel calls in a time step: // - reallocate per-atom arrays, if needed +// - transfer extra data from host to device // - build the full neighbor lists for use by different kernels // --------------------------------------------------------------------------- + template int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -402,75 +404,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute polar real-space -// --------------------------------------------------------------------------- -template -int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); - - // reallocate per-atom arrays and build the neighbor lists if needed - - int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); - - // ------------------- Resize _tep array ------------------------ - - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); - } - *tep_ptr=_tep.host.begin(); - - const int red_blocks=polar_real(eflag,vflag); - ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - device->add_ans_object(ans); - hd_balancer.stop_timer(); - - // copy tep from device to host - - _tep.update_host(_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ - return firstneigh; // nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute the direct real space part // of the permanent field @@ -504,7 +437,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i 
set_kernel(eflag,vflag); - // reallocate per-atom arrays and build the neighbor lists if needed + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, @@ -539,6 +473,85 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i return firstneigh; //nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void **tep_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
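+  //
+  //   As a rough illustration (not active code), the eventual once-per-step
+  //   flow would look like:
+  //     precompute(...);          // realloc per-atom arrays, cast host data, build neighbor lists once
+  //     compute_udirect2b(...);   // real-space permanent field
+  //     ...                       // other real-space field kernels as they come online
+  //     compute_polar_real(...);  // polar real-space forces and torques
+  //   with only uind/uinp needing to be re-cast before the later calls.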
+ + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + + const int red_blocks=polar_real(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + + template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ @@ -548,6 +561,11 @@ double BaseAmoebaT::host_memory_usage_atomic() const { template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, double** uind, double** uinp) { + + // signal that we need to transfer extra data from the host + + atom->extra_data_unavail(); + int _nall=atom->nall(); numtyp *pextra=reinterpret_cast(&(atom->extra[0])); diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index b1e6fa3f5d..2ffd4d275b 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -597,15 +597,6 @@ void PairAmoeba::induce() memory->destroy(usum); memory->destroy(usump); - if (comm->me == 0) { - printf("CPU: iter = %d\n", iter); - for (i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } - // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index cd577af912..0c9ff4c780 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -98,6 +98,10 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) suffix_flag |= Suffix::GPU; fieldp_pinned = nullptr; tep_pinned = nullptr; + + gpu_udirect2b_ready = true; + gpu_polar_real_ready = true; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -114,7 +118,6 @@ PairAmoebaGPU::~PairAmoebaGPU() void PairAmoebaGPU::polar_real() { - bool gpu_polar_real_ready = true; if (!gpu_polar_real_ready) { PairAmoeba::polar_real(); return; @@ -139,7 +142,16 @@ void PairAmoebaGPU::polar_real() domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } inum = atom->nlocal; - +/* + if (comm->me == 0) { + printf("GPU: polar real\n"); + for (int i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } +*/ firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, @@ -200,6 +212,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr) tep[0] = tep_ptr[4*i]; 
tep[1] = tep_ptr[4*i+1]; tep[2] = tep_ptr[4*i+2]; + torque2force(i,tep,fix,fiy,fiz,fpolar); iz = zaxis2local[i]; @@ -365,10 +378,14 @@ void PairAmoebaGPU::induce() // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); -/* - crstyle = FIELD; - comm->reverse_comm_pair(this); -*/ + + // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { @@ -726,7 +743,7 @@ void PairAmoebaGPU::induce() memory->destroy(udir); memory->destroy(usum); memory->destroy(usump); - +/* if (comm->me == 0) { printf("GPU: iter = %d\n", iter); for (i = 0; i < 20; i++) { @@ -735,7 +752,7 @@ void PairAmoebaGPU::induce() uinp[i][0], uinp[i][1], uinp[i][2]); } } - +*/ // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -825,7 +842,6 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp) void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = true; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 9f538ca903..d4ab9bcdfd 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -45,6 +45,9 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tep_single; + bool gpu_polar_real_ready; + bool gpu_udirect2b_ready; + void udirect2b_cpu(); template From 4a75a9bdd2b38f70b2e0da8b3e7054b46082efdb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 9 Sep 2021 14:47:29 -0500 Subject: [PATCH 019/181] Removed dfield0c from ameoba/gpu (no need to override this one) --- src/GPU/pair_amoeba_gpu.cpp | 60 +------------------------------------ src/GPU/pair_amoeba_gpu.h | 4 +-- 2 files changed, 3 insertions(+), 61 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 0c9ff4c780..3280e7b093 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -100,7 +100,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tep_pinned = nullptr; gpu_udirect2b_ready = true; - gpu_polar_real_ready = true; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -775,64 +775,6 @@ void PairAmoebaGPU::induce() } } -/* ---------------------------------------------------------------------- - dfield0c = direct induction via Ewald sum - dfield0c computes the mutual electrostatic field due to - permanent multipole moments via Ewald summation -------------------------------------------------------------------------- */ - -void PairAmoebaGPU::dfield0c(double **field, double **fieldp) -{ - int i,j,ii; - double term; - - int inum; - int *ilist; - - // zero out field,fieldp for owned and ghost atoms - - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - - for (i = 0; i < nall; i++) { - for (j = 0; j < 3; j++) { - field[i][j] = 0.0; - fieldp[i][j] = 0.0; - } - } - - // get the reciprocal space part of the permanent field - - if (kspace_flag) udirect1(field); - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - fieldp[i][j] = field[i][j]; - } - } - - // get the real space portion of the permanent field - - if (rspace_flag) udirect2b(field,fieldp); - - // get the self-energy portion of the permanent field - - term = (4.0/3.0) * 
aewald*aewald*aewald / MY_PIS; - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] += term*rpole[i][j+1]; - fieldp[i][j] += term*rpole[i][j+1]; - } - } -/* - printf("GPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < nlocal; i++) { - printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", - i, field[i][0], field[i][1], field[i][2], - fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } -*/ -} /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index d4ab9bcdfd..d0cbad90a2 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -33,9 +33,9 @@ class PairAmoebaGPU : public PairAmoeba { enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; - virtual void polar_real(); virtual void induce(); - virtual void dfield0c(double **, double **); + + virtual void polar_real(); virtual void udirect2b(double **, double **); private: From efe0bf593f531721f0a7eb00c570a2f4663db94e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 9 Sep 2021 15:19:43 -0500 Subject: [PATCH 020/181] Adding the umutual2b kernel, need to create another array for tdipdip on the GPU --- lib/gpu/lal_amoeba.cpp | 31 +++++- lib/gpu/lal_amoeba.cu | 210 ++++++++++++++++++++++++++++++++++++ lib/gpu/lal_amoeba.h | 4 +- lib/gpu/lal_base_amoeba.cpp | 82 +++++++++++++- lib/gpu/lal_base_amoeba.h | 67 +++++++----- src/AMOEBA/pair_amoeba.h | 4 +- src/GPU/pair_amoeba_gpu.cpp | 93 ++++++++++++++-- src/GPU/pair_amoeba_gpu.h | 4 +- 8 files changed, 448 insertions(+), 47 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 0d78a8618a..8bcbd6c4cb 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -58,7 +58,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_polar", "k_amoeba_udirect2b"); + "k_amoeba_polar", "k_amoeba_udirect2b", + "k_amoeba_umutual2b"); if (success!=0) return success; @@ -152,7 +153,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { @@ -177,5 +178,31 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Calculate the real-space induced field, returning field and fieldp +// --------------------------------------------------------------------------- +template +int AmoebaT::umutual2b(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int _nall=this->atom->nall(); + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, 
&_aewald, &_off2, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + template class Amoeba; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index c4f146a7c9..192f440112 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -907,6 +907,216 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp); } +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp _fieldp[6]; + for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; + + numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + //numtyp4 xi__; + + if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; 
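+      // for reference, the bn[] terms filled in just below satisfy
+      //   bn[0] = erfc(aewald*r) / r
+      //   bn[m] = ( (2m-1)*bn[m-1]
+      //           + (2*aewald*aewald)^m * exp(-(aewald*r)^2) / (aewald*sqrt(pi)) ) / r^2
+      // which is what the aesq2/aesq2n/aefac bookkeeping implements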
+ numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // ii { protected: bool _allocated; - int polar_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + }; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 9baa7b30d3..6bcd6c50c7 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -38,6 +38,7 @@ BaseAmoebaT::~BaseAmoeba() { delete nbor; k_polar.clear(); k_udirect2b.clear(); + k_umutual2b.clear(); k_special15.clear(); if (pair_program) delete pair_program; } @@ -55,7 +56,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, const char *k_name_polar, - const char *k_name_udirect2b) { + const char *k_name_udirect2b, + const char *k_name_umutual2b) { screen=_screen; int gpu_nbor=0; @@ -87,7 +89,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); 
_block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -230,7 +232,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall, +void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -473,6 +475,75 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i return firstneigh; //nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the induced field +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void** fieldp_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer extra data from the host + // and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _fieldp array ------------------------ + + if (nall>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(nall)*1.10); + _fieldp.resize(_max_fieldp_size*8); + } + *fieldp_ptr=_fieldp.host.begin(); + + const int red_blocks=umutual2b(eflag,vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + _fieldp.update_host(_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // 
--------------------------------------------------------------------------- @@ -551,7 +622,6 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const return firstneigh; // nbor->host_jlist.begin()-host_start; } - template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ @@ -621,7 +691,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_polar, - const char *kname_udirect2b) { + const char *kname_udirect2b, + const char *kname_umutual2b) { if (_compiled) return; @@ -632,6 +703,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_polar.set_function(*pair_program,kname_polar); k_udirect2b.set_function(*pair_program,kname_udirect2b); + k_umutual2b.set_function(*pair_program,kname_umutual2b); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 7d4f4c00b5..3fb752c97c 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,7 +54,8 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_polar, const char *kname_udirect2b); + const char *kname_polar, const char *kname_udirect2b, + const char *kname_umutual2b); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -140,15 +141,31 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - /// Compute polar real-space with host neighboring (not active for now) - void compute_polar_real(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *charge, - const int nlocal, double *boxlo, double *prd, void **tep_ptr); + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + int** compute_udirect2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); + + /// Compute the real space part of the induced field (umutual2b) with device neighboring + int** compute_umutual2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const 
bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring int** compute_polar_real(const int ago, const int inum_full, const int nall, @@ -162,18 +179,15 @@ class BaseAmoeba { int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd, void **tep_ptr); - /// Compute the direct real space part of the permanent field (udirect2b) with device neighboring - int** compute_udirect2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + /// Compute polar real-space with host neighboring (not active for now) + void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -224,7 +238,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_polar, k_udirect2b, k_special15; + UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -241,10 +255,13 @@ class BaseAmoeba { UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_polar, const char *kname_udirect2b); + const char *kname_polar, const char *kname_udirect2b, + const char *kname_umutual2b); - virtual int polar_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; + virtual int umutual2b(const int eflag, const int vflag) = 0; + virtual int polar_real(const int eflag, const int vflag) = 0; + }; } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 0ec601de47..b2318d296e 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -365,9 +365,9 @@ class PairAmoeba : public Pair { void ulspred(); void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); - virtual void dfield0c(double **, double **); + void dfield0c(double **, double **); void umutual1(double **, double **); - void umutual2b(double **, double **); + virtual void umutual2b(double **, double **); void udirect1(double **); virtual void udirect2b(double **, double **); void dampmut(double, double, double, double *); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3280e7b093..a1c21da3dd 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -74,7 +74,18 @@ int ** 
amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd, void **fieldp_ptr); - +/* +int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **fieldp_ptr); +*/ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -100,6 +111,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tep_pinned = nullptr; gpu_udirect2b_ready = true; + gpu_umutual2b_ready = false; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -142,16 +154,7 @@ void PairAmoebaGPU::polar_real() domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } inum = atom->nlocal; -/* - if (comm->me == 0) { - printf("GPU: polar real\n"); - for (int i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } -*/ + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, @@ -993,6 +996,74 @@ void PairAmoebaGPU::udirect2b_cpu() } } +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } +/* + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, uind, uinp, + sublo, subhi, atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + 
field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +*/ +} + /* ---------------------------------------------------------------------- */ double PairAmoebaGPU::memory_usage() diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index d0cbad90a2..4dc547e469 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -37,6 +37,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void polar_real(); virtual void udirect2b(double **, double **); + virtual void umutual2b(double **, double **); private: int gpu_mode; @@ -45,8 +46,9 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tep_single; - bool gpu_polar_real_ready; bool gpu_udirect2b_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; void udirect2b_cpu(); From b654f293ee637c25b8bb0fed92b9872eb7f58e0c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 9 Sep 2021 16:52:27 -0500 Subject: [PATCH 021/181] Working on the umutual2b kernel, the tdipdip values are computed on the fly for now, maybe a seprate neigh list as in the CPU version will be more efficient --- lib/gpu/lal_amoeba.cu | 130 ++++++++---------------------------- src/GPU/pair_amoeba_gpu.cpp | 19 ++++-- 2 files changed, 41 insertions(+), 108 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 192f440112..a4b0063a4f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -743,9 +743,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, int itype,igroup; numtyp bn[4],bcn[3]; numtyp fid[3],fip[3]; - numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; - ci = polar1[i].x; // rpole[i][0]; dix = polar1[i].y; // rpole[i][1]; diy = polar1[i].z; // rpole[i][2]; diz = polar1[i].w; // rpole[i][3]; @@ -934,10 +932,9 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); + numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); //numtyp4 xi__; @@ -953,32 +950,13 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, int itype,igroup; numtyp bn[4],bcn[3]; numtyp fid[3],fip[3]; - numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; - ci = polar1[i].x; // rpole[i][0]; - dix = polar1[i].y; // rpole[i][1]; - diy = polar1[i].z; // rpole[i][2]; - diz = polar1[i].w; // rpole[i][3]; - qixx = polar2[i].x; // rpole[i][4]; - qixy = polar2[i].y; // rpole[i][5]; - qixz = polar2[i].z; // rpole[i][6]; - qiyy = polar2[i].w; // rpole[i][8]; - qiyz = polar3[i].x; // rpole[i][9]; - qizz = polar3[i].y; // rpole[i][12]; itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; - // debug: - // xi__ = ix; xi__.w = itype; - numtyp pdi = damping[itype].x; - numtyp pti = damping[itype].y; numtyp ddi = damping[itype].z; - numtyp aesq2 = (numtyp)2.0 * aewald*aewald; - numtyp aesq2n = (numtyp)0.0; - if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); - for ( ; nborreverse_comm_pair(this); + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } for (i = 0; i < 
nlocal; i++) {
     itype = amtype[i];
 @@ -525,8 +527,11 @@ void PairAmoebaGPU::induce()
 */

   ufield0c(field,fieldp);
-  crstyle = FIELD;
-  comm->reverse_comm_pair(this);
+  if (!gpu_umutual2b_ready) {
+    crstyle = FIELD;
+    comm->reverse_comm_pair(this);
+  }
+
 /*
   if (comm->me == 0) {
     printf("GPU: cutghost = %f\n", comm->cutghost[0]);
 @@ -593,8 +598,10 @@ void PairAmoebaGPU::induce()

   //error->all(FLERR,"STOP");

-  crstyle = FIELD;
-  comm->reverse_comm_pair(this);
+  if (!gpu_umutual2b_ready) {
+    crstyle = FIELD;
+    comm->reverse_comm_pair(this);
+  }
 /*
   if (comm->me == 0) {
     printf("GPU: iter = %d\n", iter);

From a22923aee29cb238c3c008363c0cf1eccb477b2d Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Thu, 9 Sep 2021 17:22:09 -0500
Subject: [PATCH 022/181] Added the API for the umutual kernel, needs work for storing the tdipdip array

---
 lib/gpu/lal_amoeba.cu       |  2 +-
 lib/gpu/lal_amoeba_ext.cpp  | 54 ++++++++++++++++++++++++-------------
 src/GPU/pair_amoeba_gpu.cpp |  8 +++---
 3 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index a4b0063a4f..30db5ba334 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -1012,7 +1012,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
       numtyp scalek = factor_uscale;
       bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
       bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
-      numtyp tdipdip[6];
+      numtyp tdipdip[6]; // the following tdipdip is incorrect!! needs work to store tdipdip
       tdipdip[0] = -bcn[0] + bcn[1]*xr*xr;
       tdipdip[1] = bcn[1]*xr*yr;
       tdipdip[2] = bcn[1]*xr*zr;
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index 59739f9f2a..5bb4dea25f 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -105,6 +105,42 @@ void amoeba_gpu_clear() {
   AMOEBAMF.clear();
 }

+int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
+                  const int nall, double **host_x, int *host_type,
+                  int *host_amtype, int *host_amgroup, double **host_rpole,
+                  double **host_uind, double **host_uinp,
+                  double *sublo, double *subhi, tagint *tag, int **nspecial,
+                  tagint **special, int *nspecial15, tagint** special15,
+                  const bool eflag, const bool vflag,
+                  const bool eatom, const bool vatom, int &host_start,
+                  int **ilist, int **jnum, const double cpu_time,
+                  bool &success, double *host_q, double *boxlo,
+                  double *prd, void **fieldp_ptr) {
+  return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type,
+                  host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
+                  sublo, subhi, tag, nspecial, special, nspecial15, special15,
+                  eflag, vflag, eatom, vatom, host_start, ilist, jnum,
+                  cpu_time, success, host_q, boxlo, prd, fieldp_ptr);
+}
+
+int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full,
+                  const int nall, double **host_x, int *host_type,
+                  int *host_amtype, int *host_amgroup, double **host_rpole,
+                  double **host_uind, double **host_uinp,
+                  double *sublo, double *subhi, tagint *tag, int **nspecial,
+                  tagint **special, int *nspecial15, tagint** special15,
+                  const bool eflag, const bool vflag,
+                  const bool eatom, const bool vatom, int &host_start,
+                  int **ilist, int **jnum, const double cpu_time,
+                  bool &success, double *host_q, double *boxlo,
+                  double *prd, void **fieldp_ptr) {
+  return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type,
+                  host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
+                  sublo, subhi, tag, nspecial, special, nspecial15, special15,
+                  eflag, vflag, eatom, vatom, host_start, ilist, jnum,
+                  cpu_time, success, host_q, boxlo, prd, fieldp_ptr);
+}
+
 int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
                   const int nall, double **host_x, int *host_type,
                   int *host_amtype, int *host_amgroup,
@@ -124,24 +160,6 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
                   host_q, boxlo, prd, tep_ptr);
 }

-int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
-                  const int nall, double **host_x, int *host_type,
-                  int *host_amtype, int *host_amgroup, double **host_rpole,
-                  double **host_uind, double **host_uinp,
-                  double *sublo, double *subhi, tagint *tag, int **nspecial,
-                  tagint **special, int *nspecial15, tagint** special15,
-                  const bool eflag, const bool vflag,
-                  const bool eatom, const bool vatom, int &host_start,
-                  int **ilist, int **jnum, const double cpu_time,
-                  bool &success, double *host_q, double *boxlo,
-                  double *prd, void **fieldp_ptr) {
-  return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type,
-                  host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
-                  sublo, subhi, tag, nspecial, special, nspecial15, special15,
-                  eflag, vflag, eatom, vatom, host_start, ilist, jnum,
-                  cpu_time, success, host_q, boxlo, prd, fieldp_ptr);
-}
-
 double amoeba_gpu_bytes() {
   return AMOEBAMF.host_memory_usage();
 }
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index 9f1677f26d..b9ee884fa0 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -74,7 +74,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal
                 int **ilist, int **jnum, const double cpu_time, bool &success,
                 double *host_q, double *boxlo, double *prd, void **fieldp_ptr);
-/*
+
 int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall,
                 double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
                 double **host_rpole, double **host_uind, double **host_uinp,
                 double *sublo, double *subhi, tagint *tag, int **nspecial,
                 tagint **special, int* nspecial15, tagint** special15,
                 const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
                 int **ilist, int **jnum, const double cpu_time,
                 bool &success, double *host_q, double *boxlo, double *prd,
                 void **fieldp_ptr);
-*/
+
 int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall,
                 double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
                 double **host_rpole, double **host_uind, double **host_uinp,
@@ -1015,7 +1015,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
     PairAmoeba::umutual2b(field, fieldp);
     return;
   }
-/*
+
   int eflag=1, vflag=1;
   int nall = atom->nlocal + atom->nghost;
   int inum, host_start;
@@ -1068,7 +1068,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
     fieldp[i][1] += fieldp_ptr[idx+1];
     fieldp[i][2] += fieldp_ptr[idx+2];
   }
-*/
+
 }

 /* ---------------------------------------------------------------------- */

 double PairAmoebaGPU::memory_usage()

From 4ebe5833d33a62c9c1afbfd248ce00bc03e9d596 Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Fri, 10 Sep 2021 16:51:16 -0500
Subject: [PATCH 023/181] Working on short nbor list for the amoeba kernels (based on what has been done with tersoff and ellipsoid, nbor dev_packed needs to be allocated properly)

---
 lib/gpu/lal_amoeba.cpp      |  25 +++++---
 lib/gpu/lal_amoeba.cu       | 115 +++++++++++++++++++++++++++++++++++-
 lib/gpu/lal_base_amoeba.cpp |  26 +++---
 lib/gpu/lal_base_amoeba.h   |   5 +-
 src/GPU/pair_amoeba_gpu.cpp |   4 +-
 5 files changed, 151 insertions(+), 24 deletions(-)

diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 8bcbd6c4cb..08b3f1c9a5 100644
--- a/lib/gpu/lal_amoeba.cpp
+++
b/lib/gpu/lal_amoeba.cpp @@ -59,7 +59,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, "k_amoeba_polar", "k_amoeba_udirect2b", - "k_amoeba_umutual2b"); + "k_amoeba_umutual2b", "k_amoeba_short_nbor"); if (success!=0) return success; @@ -157,16 +157,23 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + int ainum=this->ans->inum(); + // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - this->time_pair.start(); - + int GX; + + GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + // NOTE: this->nbor->dev_packed is not allocated!! +/* + this->k_short_nbor.run(&this->atom->x, &_off2, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); +*/ + GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 30db5ba334..9df1dbe485 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -781,8 +781,10 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - if (r2>off2) continue; - + if (r2>off2) { + if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj); + continue; + } numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1091,3 +1093,112 @@ __kernel void k_special15(__global int * dev_nbor, } // if ii } + +/* +__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_, + const numtyp off2, __global int * dev_nbor, + const __global int * dev_packed, + const int inum, const int nbor_pitch, + const int t_per_atom) { + int tid, ii, offset, n_stride, i; + atom_info(t_per_atom,ii,tid,offset); + + int new_numj=0; + + if (ii1) { + for (unsigned int s=t_per_atom/2; s>0; s>>=1) + new_numj += shfl_down(new_numj, s, t_per_atom); + } + if (offset==0 && iipair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b, + k_name_umutual2b,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); _nbor_data=&(nbor->dev_packed); - } else + } else { _nbor_data=&(nbor->dev_nbor); - - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); + } + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,false,_threads_per_atom); if (success!=0) return success; - + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -223,6 +227,8 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, 
add_onefive_neighbors(); } + //nbor->copy_unpacked(inum,mn); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -450,7 +456,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (nall>_max_fieldp_size) { @@ -692,7 +698,8 @@ template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b) { + const char *kname_umutual2b, + const char *kname_short_nbor) { if (_compiled) return; @@ -704,6 +711,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_polar.set_function(*pair_program,kname_polar); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); + k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 3fb752c97c..755f11610f 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -55,7 +55,7 @@ class BaseAmoeba { const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b); + const char *kname_umutual2b, const char *kname_short_nbor); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -239,6 +239,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; + UCL_Kernel k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -256,7 +257,7 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b); + const char *kname_umutual2b, const char *kname_short_nbor); virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index b9ee884fa0..c51f741c0a 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_udirect2b_ready = true; gpu_umutual2b_ready = false; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -297,7 +297,7 @@ void PairAmoebaGPU::init_style() // set the energy unit conversion factor for polar real-space calculation double felec = 0.5 * electric / am_dielectric; - + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, From 7f5a82dc54e699648d2372a2c52d0cab851d13a0 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 11 Sep 2021 00:34:43 -0500 Subject: [PATCH 024/181] Switched to the short neighbor list implementation in the pre-10Feb21 version (the recent version enforces tpa = 
1 for short nbor) --- lib/gpu/lal_amoeba.cpp | 39 ++++++++++++----- lib/gpu/lal_amoeba.cu | 86 ++++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.cpp | 23 +++++++--- lib/gpu/lal_base_amoeba.h | 7 ++- src/GPU/pair_amoeba_gpu.cpp | 2 +- 5 files changed, 103 insertions(+), 54 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 08b3f1c9a5..3a83f57594 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -141,14 +141,31 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); + // Build the short neighbor list if needed + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + this->k_polar.set_size(GX,BX); this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_avail = false; + return GX; } @@ -163,20 +180,22 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int GX; + int GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); + + // Build the short neighbor list if needed + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } - GX=static_cast(ceil(static_cast(ainum)/BX)); - this->k_short_nbor.set_size(GX,BX); - // NOTE: this->nbor->dev_packed is not allocated!! 
-/* - this->k_short_nbor.run(&this->atom->x, &_off2, - &this->nbor->dev_nbor, &this->nbor->dev_packed, - &ainum, &nbor_pitch, &this->_threads_per_atom); -*/ - GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_off2, &_polar_dscale, &_polar_uscale); diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 9df1dbe485..bcb3aef309 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -196,6 +196,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict sp_polar, const __global int *dev_nbor, const __global int *dev_packed, + const __global int *dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, __global numtyp4 *restrict tep, @@ -255,6 +256,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; int numj, nbor, nbor_end; + const __global int* nbor_mem=dev_packed; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); @@ -262,6 +264,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, //numtyp qtmp; fetch(qtmp,i,q_tex); //int itype=ix.w; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + ci = polar1[i].x; // rpole[i][0]; dix = polar1[i].y; // rpole[i][1]; diy = polar1[i].z; // rpole[i][2]; @@ -289,7 +299,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, for ( ; nbor global_device; template -BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) { +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -100,9 +100,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, } else { _nbor_data=&(nbor->dev_nbor); } - + + bool allocate_packed = false; success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, - _gpu_host,max_nbors,cell_size,false,_threads_per_atom); + _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom); if (success!=0) return success; @@ -126,6 +127,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, if (ef_nall==0) ef_nall=2000; + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); @@ -158,6 +161,7 @@ void BaseAmoebaT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); + dev_short_nbor.clear(); nbor->clear(); ans->clear(); @@ -195,7 +199,7 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, // Build neighbor list on device // --------------------------------------------------------------------------- template -inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, +inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, @@ -206,7 +210,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, 
resize_atom(inum,nall,success); resize_local(inum,host_inum,nbor->max_nbors(),success); if (!success) - return; + return 0; atom->cast_copy_x(host_x,host_type); int mn; @@ -232,6 +236,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; + return mn; } // --------------------------------------------------------------------------- @@ -385,7 +390,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // Build neighbor list on GPU if necessary if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, nspecial15, special15, success); if (!success) @@ -409,6 +414,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 755f11610f..eb8938d7c4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -123,7 +123,7 @@ class BaseAmoeba { int **firstneigh, bool &success); /// Build neighbor list on device - void build_nbor_list(const int inum, const int host_inum, + int build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -236,6 +236,8 @@ class BaseAmoeba { int add_onefive_neighbors(); + UCL_D_Vec dev_short_nbor; + // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; @@ -251,8 +253,9 @@ class BaseAmoeba { bool _compiled; int _block_size, _block_bio_size, _threads_per_atom; int _extra_fields; - double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors; double _gpu_overhead, _driver_overhead; + bool short_nbor_avail; UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index c51f741c0a..9fc2ea5114 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_udirect2b_ready = true; gpu_umutual2b_ready = false; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } From c765861851c3464c9d1c93e90bba8c4e75b28aa0 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 11 Sep 2021 01:00:58 -0500 Subject: [PATCH 025/181] Cleaned up and re-arranged the functions to reflect the order of calling in a time step --- lib/gpu/lal_amoeba.cpp | 105 +++-- lib/gpu/lal_amoeba.cu | 890 +++++++++++++++++------------------- lib/gpu/lal_base_amoeba.cpp | 28 +- 3 files changed, 493 insertions(+), 530 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 3a83f57594..6f1e0cfaa9 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -126,49 +126,6 
@@ double AmoebaT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Amoeba); } -// --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep -// --------------------------------------------------------------------------- -template -int AmoebaT::polar_real(const int eflag, const int vflag) { - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - this->time_pair.start(); - - // Build the short neighbor list if needed - if (!this->short_nbor_avail) { - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, - &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; - } - - this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, - &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); - this->time_pair.stop(); - - // Signal that short nbor list is not avail for the next time step - // do it here because polar_real() is the last kernel in a time step at this point - - this->short_nbor_avail = false; - - return GX; -} - // --------------------------------------------------------------------------- // Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- @@ -182,13 +139,13 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); - // Build the short neighbor list if needed + // Build the short neighbor list if not done yet if (!this->short_nbor_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); this->short_nbor_avail = true; } @@ -219,9 +176,20 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); + // Build the short neighbor list if not done yet + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + this->k_umutual2b.set_size(GX,BX); this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_off2, &_polar_dscale, &_polar_uscale); @@ -230,5 +198,48 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// 
Calculate the polar real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::polar_real(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int _nall=this->atom->nall(); + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, + &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_avail = false; + + return GX; +} + template class Amoeba; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index bcb3aef309..fb515c69f7 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -185,6 +185,421 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp _fieldp[6]; + for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; + + numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + //numtyp4 xi__; + + if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = 
polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? 
+ } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // iioff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_uscale; + + // find terms needed later to compute mutual polarization + // if (poltyp != DIRECT) + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_uscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + numtyp tdipdip[6]; // the following tdipdip is incorrect!! 
needs work to store tdipdip + tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[1] = bcn[1]*xr*yr; + tdipdip[2] = bcn[1]*xr*zr; + tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[4] = bcn[1]*yr*zr; + tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; + + fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; + fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; + fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; + fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; + fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; koff2) continue; + //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); @@ -707,474 +1122,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv); } -/* ---------------------------------------------------------------------- - udirect2b = Ewald real direct field via list - udirect2b computes the real space contribution of the permanent - atomic multipole moments to the field via a neighbor list -------------------------------------------------------------------------- */ - -__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global numtyp4 *restrict fieldp, - const int inum, const int nall, - const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) -{ - int tid, ii, offset, i; - atom_info(t_per_atom,ii,tid,offset); - - int n_stride; - local_allocate_store_charge(); - - acctyp _fieldp[6]; - for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - - numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - - //numtyp4 xi__; - - if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); - - for ( ; nboroff2) { - if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj); - continue; - } - numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); - numtyp r2inv = rinv*rinv; - numtyp rr1 = rinv; - numtyp rr3 = rr1 * r2inv; - numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; - numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; - - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - - numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; - factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; - if (igroup == jgroup) { - factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; - 
factor_dscale = polar_dscale; - factor_uscale = polar_uscale; - } else { - factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; - factor_dscale = factor_uscale = (numtyp)1.0; - } - - // intermediates involving moments and separation distance - - numtyp dir = dix*xr + diy*yr + diz*zr; - numtyp qix = qixx*xr + qixy*yr + qixz*zr; - numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; - numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; - numtyp qir = qix*xr + qiy*yr + qiz*zr; - numtyp dkr = dkx*xr + dky*yr + dkz*zr; - numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; - numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; - numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; - numtyp qkr = qkx*xr + qky*yr + qkz*zr; - - // calculate the real space Ewald error function terms - - numtyp ralpha = aewald * r; - numtyp exp2a = ucl_exp(-ralpha*ralpha); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; - bn[0] = _erfc * rinv; - - numtyp aefac = aesq2n; - for (int m = 1; m <= 3; m++) { - numtyp bfac = (numtyp) (m+m-1); - aefac = aesq2 * aefac; - bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; - } - - // find the field components for Thole polarization damping - - numtyp scale3 = (numtyp)1.0; - numtyp scale5 = (numtyp)1.0; - numtyp scale7 = (numtyp)1.0; - numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] - if (damp != (numtyp)0.0) { - numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] - if (pgamma != (numtyp)0.0) { - damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp) ; - scale3 = (numtyp)1.0 - expdamp ; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); - scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); - } - } else { - pgamma = MIN(pti,damping[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,3.0); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp); - scale3 = (numtyp)1.0 - expdamp; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); - scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); - } - } - } else { // damp == 0: ??? 
- } - - numtyp scalek = factor_dscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; - fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; - fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; - fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - - scalek = factor_pscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; - fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; - fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; - fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - - _fieldp[0] += fid[0]; - _fieldp[1] += fid[1]; - _fieldp[2] += fid[2]; - _fieldp[3] += fip[0]; - _fieldp[4] += fip[1]; - _fieldp[5] += fip[2]; - } // nbor - - } // iioff2) continue; - - numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); - numtyp r2inv = rinv*rinv; - numtyp rr1 = rinv; - numtyp rr3 = rr1 * r2inv; - numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; - - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - numtyp ukx = polar4[j].x; // uind[j][0]; - numtyp uky = polar4[j].y; // uind[j][1]; - numtyp ukz = polar4[j].z; // uind[j][2]; - numtyp ukxp = polar5[j].x; // uinp[j][0]; - numtyp ukyp = polar5[j].y; // uinp[j][1]; - numtyp ukzp = polar5[j].z; // uinp[j][2]; - - numtyp factor_uscale; - - // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) - numtyp scale3 = (numtyp)1.0; - numtyp scale5 = (numtyp)1.0; - numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] - if (damp != (numtyp)0.0) { - numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] - if (pgamma != (numtyp)0.0) { - damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp); - scale3 = (numtyp)1.0 - expdamp; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); - } - } - } else { // damp == 0: ??? - } - - numtyp scalek = factor_uscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - numtyp tdipdip[6]; // the following tdipdip is incorrect!! 
needs work to store tdipdip - tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; - tdipdip[1] = bcn[1]*xr*yr; - tdipdip[2] = bcn[1]*xr*zr; - tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; - tdipdip[4] = bcn[1]*yr*zr; - tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; - - fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; - fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; - fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - - fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; - fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; - fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - - _fieldp[0] += fid[0]; - _fieldp[1] += fid[1]; - _fieldp[2] += fid[2]; - _fieldp[3] += fip[0]; - _fieldp[4] += fip[1]; - _fieldp[5] += fip[2]; - } // nbor - - } // ii> SBBITS & 3; - int j = sj & NEIGHMASK; - tagint jtag = tag[j]; - - if (!which) { - int offset=ii; - for (int k=0; k1) { - for (unsigned int s=t_per_atom/2; s>0; s>>=1) - new_numj += shfl_down(new_numj, s, t_per_atom); - } - if (offset==0 && iidev_nbor); } - bool allocate_packed = false; + bool alloc_packed=false; success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, - _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom); + _gpu_host,max_nbors,cell_size,alloc_packed,_threads_per_atom); if (success!=0) return success; @@ -231,8 +231,6 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, add_onefive_neighbors(); } - //nbor->copy_unpacked(inum,mn); - double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -336,17 +334,17 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f template int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; From 94d6f7219c999e8b1403be4c4f993c6e850079ae Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 11 Sep 2021 11:22:17 -0500 Subject: [PATCH 026/181] Attempted to reduce the memory footprint of the per-atom arrays --- lib/gpu/lal_base_amoeba.cpp | 17 +++++++++-------- src/GPU/pair_amoeba_gpu.cpp | 3 +++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index d06d7dfa57..cd86170e5f 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -103,7 +103,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, bool alloc_packed=false; success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, - _gpu_host,max_nbors,cell_size,alloc_packed,_threads_per_atom); + 
_gpu_host,max_nbors,cell_size,alloc_packed, + _threads_per_atom); if (success!=0) return success; @@ -123,7 +124,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, // allocate per-atom array tep - int ef_nall=nall; + int ef_nall=nlocal; //nall; if (ef_nall==0) ef_nall=2000; @@ -413,8 +414,8 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall boxlo, prd); // re-allocate dev_short_nbor if necessary - if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); + if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(inum_full)*1.10); dev_short_nbor.resize((2+_max_nbors)*_nmax); } @@ -468,8 +469,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // ------------------- Resize _fieldp array ------------------------ - if (nall>_max_fieldp_size) { - _max_fieldp_size=static_cast(static_cast(nall)*1.10); + if (inum_full>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(inum_full)*1.10); _fieldp.resize(_max_fieldp_size*8); } *fieldp_ptr=_fieldp.host.begin(); @@ -537,8 +538,8 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i // ------------------- Resize _fieldp array ------------------------ - if (nall>_max_fieldp_size) { - _max_fieldp_size=static_cast(static_cast(nall)*1.10); + if (inum_full>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(inum_full)*1.10); _fieldp.resize(_max_fieldp_size*8); } *fieldp_ptr=_fieldp.host.begin(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 9fc2ea5114..edd51667aa 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -991,6 +991,9 @@ void PairAmoebaGPU::udirect2b_cpu() tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; tdipdip[ndip++] = bcn[1]*yr*zr; tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + //printf("i = %d: j = %d: poltyp != DIRECT\n", i, j); + } else { + printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } } // jj From edd76733a10929ecb3149a928daf7c4399c42d2d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 12 Sep 2021 00:51:48 -0500 Subject: [PATCH 027/181] Working on umutual2b, tdipdip are correct, but incorrect results for field and fieldp --- lib/gpu/lal_amoeba.cu | 35 ++++++++++++++++++++++++++++++++++- lib/gpu/lal_base_amoeba.cpp | 4 ++-- src/AMOEBA/amoeba_induce.cpp | 15 +++++++++++---- src/GPU/pair_amoeba_gpu.cpp | 17 ++++++++++++----- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fb515c69f7..add17e2725 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -465,6 +465,10 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp pdi = damping[itype].x; numtyp ddi = damping[itype].z; + numtyp aesq2 = (numtyp)2.0 * aewald*aewald; + numtyp aesq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + for ( ; nbor_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); + if (inum_full>_max_tep_size) { + _max_tep_size=static_cast(static_cast(inum_full)*1.10); _tep.resize(_max_tep_size*4); } *tep_ptr=_tep.host.begin(); diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 2ffd4d275b..2294f543dd 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -279,6 +279,10 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm_pair(this); + for (int i = 0; i < 10; i++) { + printf("i = %d; fieldp = 
%f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + //error->all(FLERR,"STOP CPU"); /* if (comm->me == 0) { printf("CPU: cutghost = %f\n", comm->cutghost[0]); @@ -369,12 +373,13 @@ void PairAmoeba::induce() cfstyle = INDUCE; comm->forward_comm_pair(this); - ufield0c(field,fieldp); - - //error->all(FLERR,"STOP"); + ufield0c(field,fieldp); crstyle = FIELD; comm->reverse_comm_pair(this); + + + //error->all(FLERR,"STOP"); /* if (comm->me == 0) { printf("CPU: iter = %d\n", iter); @@ -1243,7 +1248,9 @@ void PairAmoeba::umutual2b(double **field, double **fieldp) j = jlist[jj]; uindj = uind[j]; uinpj = uinp[j]; - + //if (i==0 && j == 10) + // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", + // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); fid[0] = tdipdip[0]*uindj[0] + tdipdip[1]*uindj[1] + tdipdip[2]*uindj[2]; fid[1] = tdipdip[1]*uindj[0] + tdipdip[3]*uindj[1] + tdipdip[4]*uindj[2]; fid[2] = tdipdip[2]*uindj[0] + tdipdip[4]*uindj[1] + tdipdip[5]*uindj[2]; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index edd51667aa..bdde1176d9 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -111,7 +111,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tep_pinned = nullptr; gpu_udirect2b_ready = true; - gpu_umutual2b_ready = false; + gpu_umutual2b_ready = true; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -532,6 +532,14 @@ void PairAmoebaGPU::induce() comm->reverse_comm_pair(this); } + if (comm->me == 0) { + for (int i = 0; i < 10; i++) { + printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } + + //error->all(FLERR,"STOP GPU"); + /* if (comm->me == 0) { printf("GPU: cutghost = %f\n", comm->cutghost[0]); @@ -596,12 +604,12 @@ void PairAmoebaGPU::induce() ufield0c(field,fieldp); - //error->all(FLERR,"STOP"); - if (!gpu_umutual2b_ready) { crstyle = FIELD; comm->reverse_comm_pair(this); } + + //error->all(FLERR,"STOP"); /* if (comm->me == 0) { printf("GPU: iter = %d\n", iter); @@ -1051,7 +1059,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) error->one(FLERR,"Insufficient memory on accelerator"); // accumulate the field and fieldp values from the GPU lib - // field and fieldp may already have some nonzero values from kspace (udirect1) + // field and fieldp may already have some nonzero values from kspace (umutual1) int nlocal = atom->nlocal; double *field_ptr = (double *)fieldp_pinned; @@ -1071,7 +1079,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - } /* ---------------------------------------------------------------------- */ From bc665999d5659f820741fd614db488be37c4f47d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 13 Sep 2021 01:11:03 -0500 Subject: [PATCH 028/181] Fixed bugs with the umutual2b kernel, now the field and fieldp seems correct --- lib/gpu/lal_amoeba.cu | 29 ++++++++++------------------- src/GPU/pair_amoeba_gpu.cpp | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index add17e2725..f640690109 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -463,7 +463,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, igroup = polar3[i].w; // amgroup[i]; numtyp pdi = damping[itype].x; - numtyp ddi = damping[itype].z; + numtyp pti = damping[itype].y; numtyp aesq2 = 
 numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
 numtyp aesq2n = (numtyp)0.0;
@@ -502,16 +502,8 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
 numtyp ukzp = polar5[j].z; // uinp[j][2];
 numtyp factor_uscale;
- //const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
- //factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
- if (igroup == jgroup) {
- //factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
- //factor_dscale = polar_dscale;
- factor_uscale = polar_uscale;
- } else {
- //factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
- factor_uscale = (numtyp)1.0;
- }
+ if (igroup == jgroup) factor_uscale = polar_uscale;
+ else factor_uscale = (numtyp)1.0;
 // calculate the real space Ewald error function terms
@@ -535,15 +527,14 @@
 numtyp scale5 = (numtyp)1.0;
 numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
 if (damp != (numtyp)0.0) {
- numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
- if (pgamma != (numtyp)0.0) {
- damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
- if (damp < (numtyp)50.0) {
- numtyp expdamp = ucl_exp(-damp);
- scale3 = (numtyp)1.0 - expdamp;
- scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
- }
+ numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
+ damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
+ if (damp < (numtyp)50.0) {
+ numtyp expdamp = ucl_exp(-damp);
+ scale3 = (numtyp)1.0 - expdamp;
+ scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
+ }
 } else { // damp == 0: ???
 }
diff --git a/src/GPU/pair_amoeba_gpu.cpp
index bdde1176d9..b5096b4c1c 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -524,6 +524,14 @@ void PairAmoebaGPU::induce()
 uinp[i][0], uinp[i][1], uinp[i][2]);
 }
 }
+*/
+/*
+ if (comm->me == 0) {
+ printf("GPU before\n");
+ for (int i = 0; i < 10; i++) {
+ printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]);
+ }
+ }
 */
 ufield0c(field,fieldp);
@@ -531,12 +539,14 @@
 crstyle = FIELD;
 comm->reverse_comm_pair(this);
 }
-
+/*
 if (comm->me == 0) {
+ printf("GPU after \n");
 for (int i = 0; i < 10; i++) {
 printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]);
 }
 }
+*/
 //error->all(FLERR,"STOP GPU");
@@ -841,7 +851,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
 // rebuild dipole-dipole pair list and store pairwise dipole matrices
 // done one atom at a time in real-space double loop over atoms & neighs
- udirect2b_cpu();
+ //udirect2b_cpu();
 // accumulate the field and fieldp values from the GPU lib
 // field and fieldp may already have some nonzero values from kspace (udirect1)
From 76794bef588d3df305c5fbd76e035fb9dede16f4 Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Mon, 13 Sep 2021 01:16:42 -0500
Subject: [PATCH 029/181] Removed some of the debugging stuff
---
 src/GPU/pair_amoeba_gpu.cpp | 64 +++----------------------------------
 1 file changed, 5 insertions(+), 59 deletions(-)
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index b5096b4c1c..640d94972a 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -515,51 +515,16 @@ void PairAmoebaGPU::induce()
 cfstyle = INDUCE;
 comm->forward_comm_pair(this);
-/*
- if (comm->me == 0) {
- printf("GPU: cutghost = %f\n", comm->cutghost[0]);
- for (i = 0; i < 20; i++) {
- printf("i = %d: uind = %f %f %f; udirp = %f %f %f\n",
- i, uind[i][0], uind[i][1], uind[i][2],
- uinp[i][0],
uinp[i][1], uinp[i][2]); - } - } -*/ -/* - if (comm->me == 0) { - printf("GPU before\n"); - for (int i = 0; i < 10; i++) { - printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ + ufield0c(field,fieldp); if (!gpu_umutual2b_ready) { crstyle = FIELD; comm->reverse_comm_pair(this); } -/* - if (comm->me == 0) { - printf("GPU after \n"); - for (int i = 0; i < 10; i++) { - printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ //error->all(FLERR,"STOP GPU"); -/* - if (comm->me == 0) { - printf("GPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < nlocal; i++) { - printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", - i, field[i][0], field[i][1], field[i][2], - fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { @@ -620,16 +585,7 @@ void PairAmoebaGPU::induce() } //error->all(FLERR,"STOP"); -/* - if (comm->me == 0) { - printf("GPU: iter = %d\n", iter); - for (i = 0; i < 10; i++) { - printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", - i, field[i][0], field[i][1], field[i][2], - fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ + for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = vec[i][j]; @@ -771,16 +727,7 @@ void PairAmoebaGPU::induce() memory->destroy(udir); memory->destroy(usum); memory->destroy(usump); -/* - if (comm->me == 0) { - printf("GPU: iter = %d\n", iter); - for (i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } -*/ + // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -851,7 +798,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs - //udirect2b_cpu(); + // udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (udirect1) @@ -1009,9 +956,8 @@ void PairAmoebaGPU::udirect2b_cpu() tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; tdipdip[ndip++] = bcn[1]*yr*zr; tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; - //printf("i = %d: j = %d: poltyp != DIRECT\n", i, j); } else { - printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } } // jj From a21095fded3bb490b905bcf9d22f5cd85c8fda28 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 13 Sep 2021 13:47:15 -0500 Subject: [PATCH 030/181] More cleaning up --- lib/gpu/lal_amoeba.cpp | 3 ++- src/AMOEBA/amoeba_induce.cpp | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 6f1e0cfaa9..5030025981 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -137,7 +137,8 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); // Build the short neighbor list if not done yet if (!this->short_nbor_avail) { diff --git a/src/AMOEBA/amoeba_induce.cpp 
b/src/AMOEBA/amoeba_induce.cpp index 2294f543dd..5b855abdd0 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -279,9 +279,7 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm_pair(this); - for (int i = 0; i < 10; i++) { - printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } + //error->all(FLERR,"STOP CPU"); /* if (comm->me == 0) { From 98c1a0178c8b491636b4e4328963dcdfb3546911 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 16 Sep 2021 17:14:36 -0500 Subject: [PATCH 031/181] Refactored the API so that different off2 values are used for different kernels --- lib/gpu/lal_amoeba.cpp | 28 +++++++--------- lib/gpu/lal_amoeba.h | 5 ++- lib/gpu/lal_amoeba_ext.cpp | 36 ++++++++++----------- lib/gpu/lal_base_amoeba.cpp | 21 ++++++++---- lib/gpu/lal_base_amoeba.h | 13 +++++--- src/GPU/pair_amoeba_gpu.cpp | 64 ++++++++++++++++++++++--------------- 6 files changed, 92 insertions(+), 75 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 5030025981..8adabbe6d5 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -52,8 +52,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, - const double aewald, const double felec, - const double off2, const double polar_dscale, + const double aewald, const double polar_dscale, const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, @@ -97,8 +96,6 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda ucl_copy(sp_polar,dview,5,false); _aewald = aewald; - _felec = felec; - _off2 = off2; _polar_dscale = polar_dscale; _polar_uscale = polar_uscale; @@ -145,7 +142,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, + &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); this->short_nbor_avail = true; } @@ -155,7 +152,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &_off2, + &this->_threads_per_atom, &_aewald, &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); @@ -181,19 +178,18 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { if (!this->short_nbor_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); this->short_nbor_avail = true; } this->k_umutual2b.set_size(GX,BX); this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->_fieldp, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &_off2, - &_polar_dscale, &_polar_uscale); + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &_aewald, + 
&this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); return GX; @@ -219,7 +215,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, + &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); this->short_nbor_avail = true; } @@ -230,8 +226,8 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, - &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); + &this->_threads_per_atom, &_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); // Signal that short nbor list is not avail for the next time step diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index ea4f8b9d1d..ce30b6ab19 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -45,8 +45,7 @@ class Amoeba : public BaseAmoeba { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, - const double aewald, const double felec, - const double off2, const double polar_dscale, + const double aewald, const double polar_dscale, const double polar_uscale); /// Clear all host and device data @@ -75,7 +74,7 @@ class Amoeba : public BaseAmoeba { /// Number of atom types int _lj_types; - numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale; + numtyp _aewald, _polar_dscale, _polar_uscale; numtyp _qqrd2e; protected: diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 5bb4dea25f..bbebaa09da 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -36,8 +36,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double aewald, const double felec, - const double off2, const double polar_dscale, + const double aewald, const double polar_dscale, const double polar_uscale, int& tep_size) { AMOEBAMF.clear(); gpu_mode=AMOEBAMF.device->gpu_mode(); @@ -67,7 +66,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, - aewald, felec, off2, polar_dscale, polar_uscale); + aewald, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -87,7 +86,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, - aewald, felec, off2, polar_dscale, polar_uscale); + aewald, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) @@ -111,16 +110,16 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + const bool eflag, const bool vflag, const bool eatom, + const 
bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **fieldp_ptr) { + bool &success, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, host_q, boxlo, prd, fieldp_ptr); + cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr); } int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, @@ -132,13 +131,13 @@ int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **fieldp_ptr) { + bool &success, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, host_q, boxlo, prd, fieldp_ptr); + cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr); } int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, @@ -147,17 +146,16 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { + bool &success, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, - host_q, boxlo, prd, tep_ptr); + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index f4036ec110..2fe0e1e4b8 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -250,7 +250,8 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, + bool &success, const double off2_polar, const double felec, + double *host_q, const int nlocal, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; @@ -316,6 +317,8 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); + _off2_polar = off2_polar; + 
_felec = felec; const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); @@ -437,8 +440,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void** fieldp_ptr) { + bool &success, const double off2_polar, double *host_q, + double *boxlo, double *prd, void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -475,6 +478,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i } *fieldp_ptr=_fieldp.host.begin(); + _off2_polar = off2_polar; const int red_blocks=udirect2b(eflag,vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -506,8 +510,8 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void** fieldp_ptr) { + bool &success, const double off2_polar, double *host_q, + double *boxlo, double *prd, void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -544,6 +548,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i } *fieldp_ptr=_fieldp.host.begin(); + _off2_polar = off2_polar; const int red_blocks=umutual2b(eflag,vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -574,8 +579,8 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { + bool &success, const double felec, const double off2_polar, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -620,6 +625,8 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const } *tep_ptr=_tep.host.begin(); + _off2_polar = off2_polar; + _felec = felec; const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index eb8938d7c4..b14a234e7b 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -152,7 +152,7 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring int** compute_umutual2b(const int ago, const int inum_full, const int nall, @@ -165,7 +165,7 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double off2_polar, double *charge, double *boxlo, 
double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring int** compute_polar_real(const int ago, const int inum_full, const int nall, @@ -177,7 +177,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **tep_ptr); + const double felec, const double off2_polar, double *charge, + double *boxlo, double *prd, void **tep_ptr); /// Compute polar real-space with host neighboring (not active for now) void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, @@ -186,8 +187,8 @@ class BaseAmoeba { double **host_uinp, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *charge, - const int nlocal, double *boxlo, double *prd, void **tep_ptr); + const double cpu_time, bool &success, const double felec, const double off2_polar, + double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -258,6 +259,8 @@ class BaseAmoeba { bool short_nbor_avail; UCL_D_Vec *_nbor_data; + numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_polar, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_short_nbor); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 640d94972a..f4ead3c5fa 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -59,8 +59,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double aewald, const double felec, - const double off2, const double polar_dscale, + const double aewald, const double polar_dscale, const double polar_uscale, int& tep_size); void amoeba_gpu_clear(); @@ -69,33 +68,30 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **fieldp_ptr); + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr); int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **fieldp_ptr); + const bool eflag, const bool vflag, const bool eatom, const 
bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr); int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **tep_ptr); + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double off2, const double felec, double *host_q, + double *boxlo, double *prd, void **tep_ptr); double amoeba_gpu_bytes(); @@ -155,6 +151,15 @@ void PairAmoebaGPU::polar_real() } inum = atom->nlocal; + // select the correct cutoff for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, @@ -162,7 +167,7 @@ void PairAmoebaGPU::polar_real() atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, + success, felec, off2, atom->q, domain->boxlo, domain->prd, &tep_pinned); @@ -278,11 +283,11 @@ void PairAmoebaGPU::init_style() // select the squared cutoff (off2) for neighbor list builds (the polar term for now) // NOTE: induce and polar terms are using the same flags here - +/* if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - - double cell_size = sqrt(off2) + neighbor->skin; +*/ + double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; int maxspecial15=0; @@ -303,8 +308,7 @@ void PairAmoebaGPU::init_style() special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, - aewald, felec, off2, polar_dscale, polar_uscale, - tep_size); + aewald, polar_dscale, polar_uscale, tep_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) @@ -784,13 +788,18 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) } inum = atom->nlocal; + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, + success, off2, atom->q, domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1003,13 +1012,18 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) } inum = atom->nlocal; + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, 
uind, uinp, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, + success, off2, atom->q, domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From c0b967054e144e74e2365aa50790410006aa540e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 16 Sep 2021 17:27:44 -0500 Subject: [PATCH 032/181] Fixed bugs with zero local atoms (similar to what has been done to PPPM interp) --- lib/gpu/lal_amoeba.cpp | 28 +++++++++++++++++++--------- lib/gpu/lal_pppm.cpp | 6 ++++-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 8adabbe6d5..6bf93a3eb5 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -128,14 +128,18 @@ double AmoebaT::host_memory_usage() const { // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + int _nall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); - int ainum=this->ans->inum(); // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); + this->time_pair.start(); // Build the short neighbor list if not done yet if (!this->short_nbor_avail) { @@ -164,14 +168,17 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::umutual2b(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); // Build the short neighbor list if not done yet @@ -200,14 +207,17 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp index 6e8fe237a6..87ab6fe775 100644 --- a/lib/gpu/lal_pppm.cpp +++ b/lib/gpu/lal_pppm.cpp @@ -342,13 +342,15 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) { vd_brick.update_device(true); time_in.stop(); + int ainum=this->ans->inum(); + if (ainum==0) + return; + time_interp.start(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->ans->inum(); - 
k_interp.set_size(GX,BX); k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff, &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, From 6293da766142ce0da03be8b0ebc39027c17354b6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 16 Sep 2021 17:30:56 -0500 Subject: [PATCH 033/181] Cleaned up a bit --- src/GPU/pair_amoeba_gpu.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index f4ead3c5fa..e636e824d3 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -169,7 +169,6 @@ void PairAmoebaGPU::polar_real() host_start, &ilist, &numneigh, cpu_time, success, felec, off2, atom->q, domain->boxlo, domain->prd, &tep_pinned); - if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -281,12 +280,6 @@ void PairAmoebaGPU::init_style() } } - // select the squared cutoff (off2) for neighbor list builds (the polar term for now) - // NOTE: induce and polar terms are using the same flags here -/* - if (use_ewald) choose(POLAR_LONG); - else choose(POLAR); -*/ double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; @@ -298,11 +291,6 @@ void PairAmoebaGPU::init_style() int tep_size; int mnf = 5e-2 * neighbor->oneatom; - - // set the energy unit conversion factor for polar real-space calculation - - double felec = 0.5 * electric / am_dielectric; - int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, From 003bebd31e60118295b38687627b6108cf4f4b4d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 01:19:33 -0500 Subject: [PATCH 034/181] Working on the multipole real-space term, not ready yet --- lib/gpu/lal_amoeba.cpp | 51 +++++- lib/gpu/lal_amoeba.cu | 343 ++++++++++++++++++++++++++++++++++++ lib/gpu/lal_amoeba.h | 2 + lib/gpu/lal_amoeba_ext.cpp | 34 +++- lib/gpu/lal_base_amoeba.cpp | 95 +++++++++- lib/gpu/lal_base_amoeba.h | 25 ++- src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 332 ++++++++++++++++++++-------------- src/GPU/pair_amoeba_gpu.h | 6 +- 9 files changed, 729 insertions(+), 161 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 6bf93a3eb5..60bc365d12 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -45,7 +45,8 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const { template int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_dirdamp, + const double *host_thole, const double *host_dirdamp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -57,8 +58,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_polar", "k_amoeba_udirect2b", - "k_amoeba_umutual2b", "k_amoeba_short_nbor"); + "k_amoeba_multipole", "k_amoeba_udirect2b", + "k_amoeba_umutual2b", "k_amoeba_polar", + "k_amoeba_short_nbor"); if (success!=0) return success; @@ -91,7 +93,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda dview[i].x=host_special_polar_wscale[i]; dview[i].y=host_special_polar_piscale[i]; dview[i].z=host_special_polar_pscale[i]; - dview[i].w=(numtyp)0; + dview[i].w=host_special_mpole[i]; } 
ucl_copy(sp_polar,dview,5,false); @@ -123,6 +125,47 @@ double AmoebaT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Amoeba); } +// --------------------------------------------------------------------------- +// Calculate the polar real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + // --------------------------------------------------------------------------- // Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index f640690109..375592e338 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -44,6 +44,27 @@ _texture( q_tex,int2); #define local_allocate_store_ufld() \ __local acctyp red_acc[6][BLOCK_PAIR]; +#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ @@ -130,6 +151,19 @@ _texture( q_tex,int2); #define local_allocate_store_ufld() +#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ @@ -185,6 +219,315 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? 
(A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of multipole + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global numtyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp felec, + const numtyp off2, const numtyp polar_dscale, + const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_ufld(); + local_allocate_store_charge(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int l=0; l<6; l++) virial[l]=(acctyp)0; + } + + acctyp4 tq; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.w=(acctyp)0; + + numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + //numtyp4 xi__; + + if (iioff2) continue; + + numtyp r = ucl_sqrt(r2); + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + numtyp dik = dix*dkx + diy*dky + diz*dkz; + numtyp qik = qix*qkx + qiy*qky + qiz*qkz; + numtyp diqk = dix*qkx + diy*qky + diz*qkz; + numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; + numtyp qiqk = (numtyp )2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + qixx*qkxx + qiyy*qkyy + qizz*qkzz; + + // additional intermediates involving moments and distance + + numtyp dirx = diy*zr - diz*yr; + numtyp diry = diz*xr - dix*zr; + numtyp dirz = dix*yr - diy*xr; + numtyp dkrx = dky*zr - dkz*yr; + numtyp dkry = dkz*xr - dkx*zr; + numtyp dkrz = dkx*yr - dky*xr; + numtyp dikx = diy*dkz - diz*dky; + numtyp diky = diz*dkx - dix*dkz; + numtyp dikz = dix*dky - diy*dkx; + numtyp qirx = qiz*yr - qiy*zr; + numtyp qiry = qix*zr - qiz*xr; + numtyp qirz = qiy*xr - qix*yr; + numtyp 
qkrx = qkz*yr - qky*zr; + numtyp qkry = qkx*zr - qkz*xr; + numtyp qkrz = qky*xr - qkx*yr; + numtyp qikx = qky*qiz - qkz*qiy; + numtyp qiky = qkz*qix - qkx*qiz; + numtyp qikz = qkx*qiy - qky*qix; + numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz; + numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz; + numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz; + numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz; + numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz; + numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz; + numtyp qikrx = qizk*yr - qiyk*zr; + numtyp qikry = qixk*zr - qizk*xr; + numtyp qikrz = qiyk*xr - qixk*yr; + numtyp qkirx = qkzi*yr - qkyi*zr; + numtyp qkiry = qkxi*zr - qkzi*xr; + numtyp qkirz = qkyi*xr - qkxi*yr; + numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; + numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; + numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; + numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; + numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; + numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; + numtyp diqkrx = diqkz*yr - diqky*zr; + numtyp diqkry = diqkx*zr - diqkz*xr; + numtyp diqkrz = diqky*xr - diqkx*yr; + numtyp dkqirx = dkqiz*yr - dkqiy*zr; + numtyp dkqiry = dkqix*zr - dkqiz*xr; + numtyp dkqirz = dkqiy*xr - dkqix*yr; + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); + + // get reciprocal distance terms for this interaction + + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; + numtyp alsq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m < 6; m++) { + bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2; + } + for (m = 0; m < 6; m++) bn[m] *= felec; + + term1 = ci*ck; + term2 = ck*dir - ci*dkr + dik; + term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - 4.0*qik; + term5 = qir*qkr; + + numtyp scalek = 1.0 - factor_mpole; + rr1 = bn[0] - scalek*rr1; + rr3 = bn[1] - scalek*rr3; + rr5 = bn[2] - scalek*rr5; + rr7 = bn[3] - scalek*rr7; + rr9 = bn[4] - scalek*rr9; + rr11 = bn[5] - scalek*rr11; + numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + + // find standard multipole intermediates for force and torque + + numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + term4*rr9 + term5*rr11; + term1 = -ck*rr3 + dkr*rr5 - qkr*rr7; + term2 = ci*rr3 + dir*rr5 + qir*rr7; + term3 = (numtyp)2.0 * rr5; + term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); + term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9); + term6 = (numtyp)4.0 * rr7; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + 
term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp tix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp tiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp tiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x += frcx; + f.y += frcy; + f.z += frcz; + tq.x += tix; + tq.y += tiy; + tq.z += tiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp )-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp )-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp )-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii { * - -5 Double precision is not supported on card **/ int init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -79,6 +80,7 @@ class Amoeba : public BaseAmoeba { protected: bool _allocated; + int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); int polar_real(const int eflag, const int vflag); diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index bbebaa09da..3e1fbe47b6 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -30,6 +30,7 @@ static Amoeba AMOEBAMF; int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -63,10 +64,10 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, int init_ok=0; if (world_me==0) init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_polar_wscale, host_special_polar_piscale, - host_special_polar_pscale, nlocal, nall, max_nbors, - maxspecial, maxspecial15, cell_size, gpu_split, screen, - aewald, polar_dscale, polar_uscale); + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + nlocal, nall, max_nbors, maxspecial, maxspecial15, + cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -83,10 +84,10 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, } if (gpu_rank==i && world_me!=0) init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_polar_wscale, host_special_polar_piscale, - host_special_polar_pscale, nlocal, nall, max_nbors, - maxspecial, maxspecial15, cell_size, gpu_split, screen, - aewald, polar_dscale, polar_uscale); + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + nlocal, nall, 
max_nbors, maxspecial, maxspecial15, + cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) @@ -104,6 +105,23 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } +int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr); +} + int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 2fe0e1e4b8..585061e095 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -36,9 +36,10 @@ template BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; - k_polar.clear(); + k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); + k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); if (pair_program) delete pair_program; @@ -56,9 +57,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name_polar, + const char *k_name_multipole, const char *k_name_udirect2b, const char *k_name_umutual2b, + const char *k_name_polar, const char *k_name_short_nbor) { screen=_screen; @@ -91,8 +93,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b, - k_name_umutual2b,k_name_short_nbor); + compile_kernels(*ucl_device,pair_program,k_name_multipole,k_name_udirect2b, + k_name_umutual2b,k_name_polar,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -425,6 +427,85 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double felec, const double off2_mpole, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + 
acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>_max_tep_size) { + _max_tep_size=static_cast(static_cast(inum_full)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + + _off2_mpole = off2_mpole; + _felec = felec; + const int red_blocks=multipole_real(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute the direct real space part // of the permanent field @@ -713,9 +794,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname_polar, + const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, + const char *kname_polar, const char *kname_short_nbor) { if (_compiled) return; @@ -725,9 +807,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - k_polar.set_function(*pair_program,kname_polar); + k_multipole.set_function(*pair_program,kname_multipole); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); + k_polar.set_function(*pair_program,kname_polar); k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index b14a234e7b..1762f156d3 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,8 +54,9 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const 
double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_short_nbor); + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_short_nbor); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -141,6 +142,18 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); + /// Compute multipole real-space with device neighboring + int** compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double felec, const double off2_mpole, double *charge, + double *boxlo, double *prd, void **tep_ptr); + /// Compute the real space part of the permanent field (udirect2b) with device neighboring int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -241,7 +254,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_special15; UCL_Kernel k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -262,9 +275,11 @@ class BaseAmoeba { numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_short_nbor); + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_short_nbor); + virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index b2318d296e..72c142888e 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -352,7 +352,7 @@ class PairAmoeba : public Pair { void dispersion_kspace(); void multipole(); - void multipole_real(); + virtual void multipole_real(); void multipole_kspace(); void polar(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index e636e824d3..30b35919c1 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -53,6 +53,7 @@ enum{GORDON1,GORDON2}; int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -63,6 +64,15 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const double polar_uscale, int& 
tep_size); void amoeba_gpu_clear(); +int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double felec, const double off2, double *host_q, + double *boxlo, double *prd, void **tep_ptr); + int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -90,7 +100,7 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, const double felec, double *host_q, + bool &success, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr); double amoeba_gpu_bytes(); @@ -106,6 +116,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tep_pinned = nullptr; + gpu_multipole_real_ready = false; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; @@ -122,139 +133,6 @@ PairAmoebaGPU::~PairAmoebaGPU() amoeba_gpu_clear(); } -/* ---------------------------------------------------------------------- */ - -void PairAmoebaGPU::polar_real() -{ - if (!gpu_polar_real_ready) { - PairAmoeba::polar_real(); - return; - } - - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; - - double sublo[3],subhi[3]; - if (domain->triclinic == 0) { - sublo[0] = domain->sublo[0]; - sublo[1] = domain->sublo[1]; - sublo[2] = domain->sublo[2]; - subhi[0] = domain->subhi[0]; - subhi[1] = domain->subhi[1]; - subhi[2] = domain->subhi[2]; - } else { - domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); - } - inum = atom->nlocal; - - // select the correct cutoff for the term - - if (use_ewald) choose(POLAR_LONG); - else choose(POLAR); - - // set the energy unit conversion factor for polar real-space calculation - - double felec = 0.5 * electric / am_dielectric; - - firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, felec, off2, atom->q, domain->boxlo, - domain->prd, &tep_pinned); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); - - // reference to the tep array from GPU lib - - if (tep_single) { - float *tep_ptr = (float *)tep_pinned; - compute_force_from_tep(tep_ptr); - } else { - double *tep_ptr = (double *)tep_pinned; - compute_force_from_tep(tep_ptr); - } -} - -/* ---------------------------------------------------------------------- - init specific to this pair style -------------------------------------------------------------------------- */ - -template -void 
PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr) -{ - int i,ix,iy,iz; - double ci,dix,diy,diz; - double qixx,qixy,qixz; - double qiyy,qiyz,qizz; - double xix,yix,zix; - double xiy,yiy,ziy; - double xiz,yiz,ziz; - double vxx,vyy,vzz; - double vxy,vxz,vyz; - double fix[3],fiy[3],fiz[3],tep[4]; - - double** x = atom->x; - int nlocal = atom->nlocal; - - for (i = 0; i < nlocal; i++) { - dix = rpole[i][1]; - diy = rpole[i][2]; - diz = rpole[i][3]; - qixx = rpole[i][4]; - qixy = rpole[i][5]; - qixz = rpole[i][6]; - qiyy = rpole[i][8]; - qiyz = rpole[i][9]; - qizz = rpole[i][12]; - - tep[0] = tep_ptr[4*i]; - tep[1] = tep_ptr[4*i+1]; - tep[2] = tep_ptr[4*i+2]; - - torque2force(i,tep,fix,fiy,fiz,fpolar); - - iz = zaxis2local[i]; - ix = xaxis2local[i]; - iy = yaxis2local[i]; - - xiz = x[iz][0] - x[i][0]; - yiz = x[iz][1] - x[i][1]; - ziz = x[iz][2] - x[i][2]; - xix = x[ix][0] - x[i][0]; - yix = x[ix][1] - x[i][1]; - zix = x[ix][2] - x[i][2]; - xiy = x[iy][0] - x[i][0]; - yiy = x[iy][1] - x[i][1]; - ziy = x[iy][2] - x[i][2]; - - vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; - vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; - vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; - vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + - xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); - vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + - xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + - yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - - virpolar[0] += vxx; - virpolar[1] += vyy; - virpolar[2] += vzz; - virpolar[3] += vxy; - virpolar[4] += vxz; - virpolar[5] += vyz; - } -} - /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ @@ -292,7 +170,7 @@ void PairAmoebaGPU::init_style() int tep_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, - special_polar_wscale, special_polar_piscale, + special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, @@ -308,6 +186,68 @@ void PairAmoebaGPU::init_style() tep_single = true; } +/* ---------------------------------------------------------------------- */ + +void PairAmoebaGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + + firstneigh = amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, felec, 
off2, atom->q, domain->boxlo, + domain->prd, &tep_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tep_single) { + float *tep_ptr = (float *)tep_pinned; + compute_force_from_tep(tep_ptr, fmpole, virmpole); + } else { + double *tep_ptr = (double *)tep_pinned; + compute_force_from_tep(tep_ptr, fmpole, virmpole); + } +} + /* ---------------------------------------------------------------------- induce = induced dipole moments via pre-conditioned CG solver adapted from Tinker induce0a() routine @@ -1041,6 +981,128 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) /* ---------------------------------------------------------------------- */ +void PairAmoebaGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, felec, off2, atom->q, domain->boxlo, + domain->prd, &tep_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tep_single) { + float *tep_ptr = (float *)tep_pinned; + compute_force_from_tep(tep_ptr, fpolar, virpolar); + } else { + double *tep_ptr = (double *)tep_pinned; + compute_force_from_tep(tep_ptr, fpolar, virpolar); + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +template +void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],tep[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + tep[0] = tep_ptr[4*i]; + tep[1] = tep_ptr[4*i+1]; + tep[2] = tep_ptr[4*i+2]; + + torque2force(i,tep,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + 
yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] += vxx; + virial_comp[1] += vyy; + virial_comp[2] += vzz; + virial_comp[3] += vxy; + virial_comp[4] += vxz; + virial_comp[5] += vyz; + } +} + +/* ---------------------------------------------------------------------- */ + double PairAmoebaGPU::memory_usage() { double bytes = Pair::memory_usage(); diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 4dc547e469..a913449a62 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -35,9 +35,10 @@ class PairAmoebaGPU : public PairAmoeba { virtual void induce(); - virtual void polar_real(); + virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); + virtual void polar_real(); private: int gpu_mode; @@ -46,6 +47,7 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tep_single; + bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; @@ -53,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba { void udirect2b_cpu(); template - void compute_force_from_tep(const numtyp*); + void compute_force_from_tep(const numtyp*, double**, double*); }; } // namespace LAMMPS_NS From d9267059505be34d3a7c9469b6b7e071eb2b6219 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 01:32:00 -0500 Subject: [PATCH 035/181] Short neighbor list for multipole real-space should be built with off2_mpole --- lib/gpu/lal_amoeba.cpp | 17 ++++++++--------- lib/gpu/lal_amoeba.cu | 2 +- src/GPU/pair_amoeba_gpu.cpp | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 60bc365d12..d109c98c42 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -143,15 +143,14 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list if not done yet - if (!this->short_nbor_avail) { - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &this->_off2_polar, &ainum, - &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; - } + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 375592e338..e14cb99328 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -826,7 +826,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - if (r2>off2) continue; + //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 30b35919c1..67c9d6109f 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -116,7 
+116,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tep_pinned = nullptr; - gpu_multipole_real_ready = false; + gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; From 2e6df83b9b7979ed2f2a79591a18c520ff6e94fc Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 15:24:36 -0500 Subject: [PATCH 036/181] Fixed bugs in the multipole real-space part on the GPU; separately multipole real and polar real work correctly (along with udirect2b and umutual2b), but together they are conflicting due to the use of ans to copy forces back from device to host. The other 2 kernels (induce part) do not touch forces and energies. --- lib/gpu/lal_amoeba.cpp | 12 +++---- lib/gpu/lal_amoeba.cu | 62 +++++++++++++++------------------ lib/gpu/lal_amoeba.h | 5 ++- lib/gpu/lal_amoeba_ext.cpp | 24 ++++++------- lib/gpu/lal_base_amoeba.cpp | 23 +++++++----- lib/gpu/lal_base_amoeba.h | 15 ++++---- src/AMOEBA/amoeba_multipole.cpp | 30 +++++++++++----- src/AMOEBA/amoeba_polar.cpp | 8 +++-- src/AMOEBA/pair_amoeba.cpp | 3 ++ src/GPU/pair_amoeba_gpu.cpp | 45 ++++++++++++------------ 10 files changed, 123 insertions(+), 104 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index d109c98c42..af71decb86 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -53,8 +53,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, - const double aewald, const double polar_dscale, - const double polar_uscale) { + const double polar_dscale, const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, @@ -97,7 +96,6 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda } ucl_copy(sp_polar,dview,5,false); - _aewald = aewald; _polar_dscale = polar_dscale; _polar_uscale = polar_uscale; @@ -158,7 +156,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &this->_felec, + &this->_threads_per_atom, &this->_aewald, &this->_felec, &this->_off2_mpole, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); @@ -198,7 +196,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &this->_off2_polar, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); @@ -237,7 +235,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, - &nbor_pitch, &this->_threads_per_atom, &_aewald, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); @@ -278,7 +276,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, 
&_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &this->_felec, + &this->_threads_per_atom, &this->_aewald, &this->_felec, &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index e14cb99328..910316d289 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -225,20 +225,20 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - __global numtyp4 *restrict tep, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp felec, - const numtyp off2, const numtyp polar_dscale, - const numtyp polar_uscale) + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global numtyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp felec, + const numtyp off2, const numtyp polar_dscale, + const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -257,7 +257,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, } acctyp4 tq; - tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.w=(acctyp)0; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; numtyp4* polar1 = (numtyp4*)(&extra[0]); @@ -272,7 +272,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp term1,term2,term3; numtyp term4,term5; numtyp term6,term7; - numtyp rc3[3],rc5[3],rc7[3]; numtyp bn[6]; numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; @@ -309,9 +308,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // debug: // xi__ = ix; xi__.w = itype; - numtyp pdi = damping[itype].x; - numtyp pti = damping[itype].y; - for ( ; nboroff2) continue; + if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp ck = polar1[j].x; // rpole[j][0]; numtyp dkx = polar1[j].y; // rpole[j][1]; numtyp dky = polar1[j].z; // rpole[j][2]; @@ -363,7 +358,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp )2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -452,8 +447,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); term4 = dir*qkr - dkr*qir - 4.0*qik; term5 = qir*qkr; - - numtyp scalek = 1.0 - factor_mpole; + numtyp scalek = (numtyp)1.0 - factor_mpole; rr1 = bn[0] - scalek*rr1; rr3 = bn[1] - scalek*rr3; rr5 = bn[2] - scalek*rr5; @@ -485,11 +479,11 @@ __kernel void 
k_amoeba_multipole(const __global numtyp4 *restrict x_, // compute the torque components for this interaction - numtyp tix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp tiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp tiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); // increment force-based gradient and torque on first site @@ -497,16 +491,16 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, f.x += frcx; f.y += frcy; f.z += frcz; - tq.x += tix; - tq.y += tiy; - tq.z += tiz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; if (EVFLAG && vflag) { numtyp vxx = -xr * frcx; - numtyp vxy = (numtyp )-0.5 * (yr*frcx+xr*frcy); - numtyp vxz = (numtyp )-0.5 * (zr*frcx+xr*frcz); + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); numtyp vyy = -yr * frcy; - numtyp vyz = (numtyp )-0.5 * (zr*frcy+yr*frcz); + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); numtyp vzz = -zr * frcz; virial[0] += vxx; @@ -520,7 +514,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, } // ii { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, - const double aewald, const double polar_dscale, - const double polar_uscale); + const double polar_dscale, const double polar_uscale); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ @@ -75,7 +74,7 @@ class Amoeba : public BaseAmoeba { /// Number of atom types int _lj_types; - numtyp _aewald, _polar_dscale, _polar_uscale; + numtyp _polar_dscale, _polar_uscale; numtyp _qqrd2e; protected: diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 3e1fbe47b6..8493e9331d 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -37,8 +37,8 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double aewald, const double polar_dscale, - const double polar_uscale, int& tep_size) { + const double polar_dscale, const double polar_uscale, + int& tep_size) { AMOEBAMF.clear(); gpu_mode=AMOEBAMF.device->gpu_mode(); double gpu_split=AMOEBAMF.device->particle_split(); @@ -67,7 +67,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale); + cell_size, gpu_split, screen, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -87,7 +87,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale); + cell_size, gpu_split, screen, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) @@ -113,13 +113,13 @@ int** 
amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2, + bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr); + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); } int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, @@ -131,13 +131,13 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, double *host_q, + bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr); + cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); } int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, @@ -149,13 +149,13 @@ int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, double *host_q, + bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr); + cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); } int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, @@ -167,13 +167,13 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2, + bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr); + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 585061e095..3480ce55db 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ 
b/lib/gpu/lal_base_amoeba.cpp @@ -252,8 +252,8 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, const double off2_polar, const double felec, - double *host_q, const int nlocal, + bool &success, const double aewald, const double felec, + const double off2_polar, double *host_q, const int nlocal, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; @@ -440,7 +440,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2_mpole, + bool &success, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; @@ -488,6 +488,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co _off2_mpole = off2_mpole; _felec = felec; + _aewald = aewald; const int red_blocks=multipole_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); @@ -521,8 +522,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2_polar, double *host_q, - double *boxlo, double *prd, void** fieldp_ptr) { + bool &success, const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -560,6 +561,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i *fieldp_ptr=_fieldp.host.begin(); _off2_polar = off2_polar; + _aewald = aewald; const int red_blocks=udirect2b(eflag,vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -591,8 +593,8 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2_polar, double *host_q, - double *boxlo, double *prd, void** fieldp_ptr) { + bool &success, const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -630,6 +632,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i *fieldp_ptr=_fieldp.host.begin(); _off2_polar = off2_polar; + _aewald = aewald; const int red_blocks=umutual2b(eflag,vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -660,8 +663,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2_polar, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { + bool &success, const double aewald, const double felec, + const double off2_polar, double *host_q, double *boxlo, + double *prd, void **tep_ptr) { 
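  // compute_polar_real() mirrors the drivers above (e.g. compute_multipole_real()):
  // encode the energy/virial request below, set the per-call constants
  // (_off2_polar, _felec, _aewald), launch the polar real-space kernel, then
  // copy the accumulated answers and the per-atom tep (torque) buffer back to
  // the host.  The eflag/vflag encoding used by these drivers is:
  //   0 = no accumulation, 1 = global energy/virial only,
  //   2 = per-atom terms as well (used when eatom/vatom is set, or
  //       unconditionally when LAL_NO_BLOCK_REDUCE is defined).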
acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -708,6 +712,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const _off2_polar = off2_polar; _felec = felec; + _aewald = aewald; const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 1762f156d3..0b6c09742e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -151,7 +151,7 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double felec, const double off2_mpole, double *charge, + const double aewald, const double felec, const double off2_mpole, double *charge, double *boxlo, double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring @@ -165,7 +165,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, double *charge, + double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring int** compute_umutual2b(const int ago, const int inum_full, const int nall, @@ -178,7 +179,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, double *charge, + double *boxlo, double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring int** compute_polar_real(const int ago, const int inum_full, const int nall, @@ -190,7 +192,7 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double felec, const double off2_polar, double *charge, + const double aewald, const double felec, const double off2_polar, double *charge, double *boxlo, double *prd, void **tep_ptr); /// Compute polar real-space with host neighboring (not active for now) @@ -200,7 +202,7 @@ class BaseAmoeba { double **host_uinp, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, const double felec, const double off2_polar, + const double cpu_time, bool &success, const double aewald, const double felec, const double off2_polar, double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -272,7 +274,8 @@ class BaseAmoeba { bool short_nbor_avail; UCL_D_Vec *_nbor_data; - numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; + numtyp _aewald,_felec; + numtyp _off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 
c06f07d70c..62255db6f2 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -369,6 +369,9 @@ void PairAmoeba::multipole_real() bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2; } for (k = 0; k < 6; k++) bn[k] *= felec; + //if (i == 0 && j < 10) { + // printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]); + //} // find damped multipole intermediates and energy value @@ -447,6 +450,10 @@ void PairAmoeba::multipole_real() rr9 = bn[4] - scalek*rr9; rr11 = bn[5] - scalek*rr11; e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + if (i == 0 && j < 10) { + //printf("j = %d: scalek = %f; rr11 = %f; terms: %f %f %f %f %f\n", j, scalek, rr11, term1, term2, term3, term4, term5); + //printf("j = %d: felec = %f; rr1 = %f; bn0 = %f\n", j, felec, rr1, bn[0]); + } // find standard multipole intermediates for force and torque @@ -457,6 +464,7 @@ void PairAmoeba::multipole_real() term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9); term6 = 4.0 * rr7; + } empole += e; @@ -515,16 +523,20 @@ void PairAmoeba::multipole_real() tq[i][0] += ttmi[0]; tq[i][1] += ttmi[1]; tq[i][2] += ttmi[2]; - + //if (i == 0 && j < 10) { + // printf("j = %d: erfc = %f; f: %f %f %f; tq = %f %f %f\n", j, erfc(ralpha), frcx, frcy, frcz, ttmi[0], ttmi[1], ttmi[2]); + //printf("j = %d: terms: %f %f %f; tq = %f %f %f\n", j, term1, term2, term3, qikrx, qikry, qikrz); + //} // increment force-based gradient and torque on second site - + // commenting out j parts for DEBUGGING + fmpole[j][0] -= frcx; fmpole[j][1] -= frcy; fmpole[j][2] -= frcz; tq[j][0] += ttmk[0]; tq[j][1] += ttmk[1]; tq[j][2] += ttmk[2]; - + // increment the virial due to pairwise Cartesian forces vxx = -xr * frcx; @@ -556,10 +568,11 @@ void PairAmoeba::multipole_real() comm->reverse_comm_pair(this); // resolve site torques then increment forces and virial - + printf("compute multipole real\n"); for (i = 0; i < nlocal; i++) { - torque2force(i,tq[i],fix,fiy,fiz,fmpole); - + if (i == 0) printf("before fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]); + torque2force(i,tq[i],fix,fiy,fiz,fmpole); + if (i == 0) printf("after fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]); iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; @@ -575,15 +588,16 @@ void PairAmoeba::multipole_real() ziy = x[iy][2] - x[i][2]; vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); virmpole[0] += vxx; virmpole[1] += vyy; virmpole[2] += vzz; diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 1503243220..659194ac0b 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -1176,7 +1176,7 @@ void PairAmoeba::polar_real() comm->reverse_comm_pair(this); // torque is induced field and gradient cross permanent moments - + printf("compute polar real\n"); for (i = 0; i < nlocal; i++) { dix = rpole[i][1]; diy = rpole[i][2]; @@ -1197,8 
+1197,10 @@ void PairAmoeba::polar_real() qiyz*dufld[i][3] - qixz*dufld[i][4] + 2.0*qixy*(dufld[i][0]-dufld[i][2]) + (qiyy-qixx)*dufld[i][1]; + if (i == 0) printf("before fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]); torque2force(i,tep,fix,fiy,fiz,fpolar); - + if (i == 0) printf("after fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; @@ -1222,7 +1224,7 @@ void PairAmoeba::polar_real() xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - + //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); virpolar[0] += vxx; virpolar[1] += vyy; virpolar[2] += vzz; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index f9e098e884..5157739f0e 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -972,6 +972,9 @@ void PairAmoeba::init_style() // request neighbor lists int irequest = neighbor->request(this,instance_me); + // for DEBUGGING with GPU + //neighbor->requests[irequest]->half = 0; + //neighbor->requests[irequest]->full = 1; // open debug output files // names are hard-coded diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 67c9d6109f..d33b8d1431 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -60,8 +60,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double aewald, const double polar_dscale, - const double polar_uscale, int& tep_size); + const double polar_dscale, const double polar_uscale, int& tep_size); void amoeba_gpu_clear(); int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, @@ -70,8 +69,8 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2, double *host_q, - double *boxlo, double *prd, void **tep_ptr); + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr); int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, @@ -80,7 +79,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, double *host_q, + bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr); int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, @@ -90,7 +89,7 @@ int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nal tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, double 
*host_q, + bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr); int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, @@ -100,8 +99,8 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2, double *host_q, - double *boxlo, double *prd, void **tep_ptr); + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr); double amoeba_gpu_bytes(); @@ -119,7 +118,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -174,7 +173,7 @@ void PairAmoebaGPU::init_style() special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, - aewald, polar_dscale, polar_uscale, tep_size); + polar_dscale, polar_uscale, tep_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) @@ -231,14 +230,14 @@ void PairAmoebaGPU::multipole_real() atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, felec, off2, atom->q, domain->boxlo, - domain->prd, &tep_pinned); + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tep_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // reference to the tep array from GPU lib - + printf("compute multipole real\n"); if (tep_single) { float *tep_ptr = (float *)tep_pinned; compute_force_from_tep(tep_ptr, fmpole, virmpole); @@ -727,7 +726,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, off2, atom->q, domain->boxlo, + success, aewald, off2, atom->q, domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -951,7 +950,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, off2, atom->q, domain->boxlo, + success,aewald, off2, atom->q, domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1008,7 +1007,7 @@ void PairAmoebaGPU::polar_real() } inum = atom->nlocal; - // select the correct cutoff for the term + // select the correct cutoff and aewald for the term if (use_ewald) choose(POLAR_LONG); else choose(POLAR); @@ -1024,14 +1023,14 @@ void PairAmoebaGPU::polar_real() atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, felec, off2, atom->q, domain->boxlo, + success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tep_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // reference to the tep array from GPU lib - + printf("compute polar real\n"); if (tep_single) { float *tep_ptr = (float *)tep_pinned; 
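    // tep_pinned is the opaque (void *) handle to the per-atom torque buffer
    // returned by the GPU library; its element type depends on the precision
    // the library was built with, so tep_single (set in init_style() from the
    // size reported back by amoeba_gpu_init()) selects the float or double
    // cast before the torques are converted into forces and virial terms.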
compute_force_from_tep(tep_ptr, fpolar, virpolar); @@ -1066,7 +1065,9 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, tep[1] = tep_ptr[4*i+1]; tep[2] = tep_ptr[4*i+2]; + if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); torque2force(i,tep,fix,fiy,fiz,force_comp); + if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); iz = zaxis2local[i]; ix = xaxis2local[i]; @@ -1086,12 +1087,12 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + - xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + - xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + - yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); virial_comp[0] += vxx; virial_comp[1] += vyy; virial_comp[2] += vzz; From f5713a52b34e168d725b9ca4a471b484f02596a2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 16:39:57 -0500 Subject: [PATCH 037/181] Added another kernel to accumulate forces, energies and virial on the device (similar to the tersoff kernels) as multiple kernels all added to those quantities; also only copy answers back to the host in the last kernel in a time step; cleaned up debugging messages --- lib/gpu/lal_amoeba.cu | 197 ++++++++++++++++++++++++++++++-- lib/gpu/lal_base_amoeba.cpp | 11 +- lib/gpu/lal_tersoff.cu | 6 +- src/AMOEBA/amoeba_multipole.cpp | 11 +- src/AMOEBA/amoeba_polar.cpp | 4 +- src/GPU/pair_amoeba_gpu.cpp | 2 +- 6 files changed, 204 insertions(+), 27 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 910316d289..49c0d78d7f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -147,6 +147,70 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } +#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 
6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && iicopy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -490,8 +491,11 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co _felec = felec; _aewald = aewald; const int red_blocks=multipole_real(eflag,vflag); - ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - device->add_ans_object(ans); + + // leave the answers (forces, energies and virial) on the device, only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + hd_balancer.stop_timer(); // copy tep from device to host @@ -714,8 +718,11 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const _felec = felec; _aewald = aewald; const int red_blocks=polar_real(eflag,vflag); + + // only copy answers (forces, energies and virial) back from the device in the last kernel (which is polar_real here) ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); + hd_balancer.stop_timer(); // copy tep from device to host diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index 8baa5ce12a..feab8bb5c0 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4); } \ } +// (SHUFFLE_AVAIL == 1) #else #define local_allocate_acc_zeta() @@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4); } \ } +// EVFLAG == 0 #else #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ @@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4); ans[ii]=old; \ } -#endif -#endif +#endif // EVFLAG +#endif // SHUFFLE_AVAIL #ifdef LAL_SIMD_IP_SYNC #define t_per_atom t_per_atom_in diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 62255db6f2..3f5c9082e7 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -523,10 +523,7 @@ void PairAmoeba::multipole_real() tq[i][0] += ttmi[0]; tq[i][1] += ttmi[1]; tq[i][2] += ttmi[2]; - //if (i == 0 && j < 10) { - // printf("j = %d: erfc = %f; f: %f %f %f; tq = %f %f %f\n", j, erfc(ralpha), frcx, frcy, frcz, ttmi[0], ttmi[1], ttmi[2]); - //printf("j = %d: terms: %f %f %f; tq = %f %f %f\n", j, term1, term2, term3, qikrx, qikry, qikrz); - //} + // increment force-based gradient and torque on second site // commenting out j parts for DEBUGGING @@ -568,11 +565,10 @@ void PairAmoeba::multipole_real() comm->reverse_comm_pair(this); // resolve site torques then increment forces and virial - printf("compute multipole real\n"); + for (i = 0; i < nlocal; i++) { - if (i == 0) printf("before fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]); torque2force(i,tq[i],fix,fiy,fiz,fmpole); - if (i == 0) printf("after fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], 
fmpole[i][2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; @@ -597,7 +593,6 @@ void PairAmoeba::multipole_real() vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); virmpole[0] += vxx; virmpole[1] += vyy; virmpole[2] += vzz; diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 659194ac0b..f4acf3e7a8 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -1176,7 +1176,7 @@ void PairAmoeba::polar_real() comm->reverse_comm_pair(this); // torque is induced field and gradient cross permanent moments - printf("compute polar real\n"); + for (i = 0; i < nlocal; i++) { dix = rpole[i][1]; diy = rpole[i][2]; @@ -1197,9 +1197,7 @@ void PairAmoeba::polar_real() qiyz*dufld[i][3] - qixz*dufld[i][4] + 2.0*qixy*(dufld[i][0]-dufld[i][2]) + (qiyy-qixx)*dufld[i][1]; - if (i == 0) printf("before fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]); torque2force(i,tep,fix,fiy,fiz,fpolar); - if (i == 0) printf("after fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]); iz = zaxis2local[i]; ix = xaxis2local[i]; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index d33b8d1431..dcf7d95047 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -118,7 +118,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } From 78045d8f7621cc12cb60beb4fd9d9008bb1c65e3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 23:13:51 -0500 Subject: [PATCH 038/181] Cleaned up debugging stuffs and unused variables --- lib/gpu/lal_amoeba.cu | 68 +++++++++------------ src/GPU/pair_amoeba_gpu.cpp | 116 ++++++++++++++++++------------------ src/GPU/pair_amoeba_gpu.h | 6 +- 3 files changed, 89 insertions(+), 101 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 49c0d78d7f..41185f30e3 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -62,7 +62,7 @@ _texture( q_tex,int2); tq.z=red_acc[2][tid]; \ } \ if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ if (EVFLAG && (vflag==2 || eflag==2)) { \ if (eflag) { \ simdsync(); \ - simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ if (vflag) { \ simdsync(); \ @@ -174,7 +174,7 @@ _texture( q_tex,int2); if (eflag!=2 && vflag!=2) { \ if (eflag) { \ simdsync(); \ - block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \ + block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \ if (vflag) __syncthreads(); \ if (tid==0) { \ engv[ei]+=energy*(acctyp)0.5; \ @@ -225,7 +225,7 @@ _texture( q_tex,int2); } \ } \ if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (vflag==2 || eflag==2) { \ if (eflag) \ - simd_reduce_add2(t_per_atom,energy,e_coul); \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ if (vflag) \ simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -402,20 +400,20 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void 
k_amoeba_multipole(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - __global numtyp4 *restrict tep, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp felec, - const numtyp off2, const numtyp polar_dscale, - const numtyp polar_uscale) + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global numtyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp felec, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -439,14 +437,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - //numtyp4 xi__; - if (iioff2) continue; + //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); numtyp ck = polar1[j].x; // rpole[j][0]; @@ -613,14 +605,14 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, for (m = 1; m < 6; m++) { bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; - bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } for (m = 0; m < 6; m++) bn[m] *= felec; term1 = ci*ck; term2 = ck*dir - ci*dkr + dik; - term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); - term4 = dir*qkr - dkr*qir - 4.0*qik; + term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; term5 = qir*qkr; numtyp scalek = (numtyp)1.0 - factor_mpole; rr1 = bn[0] - scalek*rr1; @@ -730,8 +722,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - //numtyp4 xi__; - if (iimaxspecial15; } - int tep_size; + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, - polar_dscale, polar_uscale, tep_size); + polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - if (tep_size == sizeof(double)) - tep_single = false; + if (tq_size == sizeof(double)) + tq_single = false; else - tep_single = true; + tq_single = true; } /* ---------------------------------------------------------------------- */ @@ -231,19 +231,19 @@ void PairAmoebaGPU::multipole_real() eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tep_pinned); + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient 
memory on accelerator"); // reference to the tep array from GPU lib - printf("compute multipole real\n"); - if (tep_single) { - float *tep_ptr = (float *)tep_pinned; - compute_force_from_tep(tep_ptr, fmpole, virmpole); + + if (tq_single) { + float *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); } else { - double *tep_ptr = (double *)tep_pinned; - compute_force_from_tep(tep_ptr, fmpole, virmpole); + double *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); } } @@ -681,7 +681,6 @@ void PairAmoebaGPU::induce() } } - /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list udirect2b computes the real space contribution of the permanent @@ -721,19 +720,20 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) else choose(POLAR); firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, uind, uinp, - sublo, subhi, atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, domain->boxlo, - domain->prd, &fieldp_pinned); + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs - + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() // udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib @@ -945,13 +945,14 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) else choose(POLAR); firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, uind, uinp, - sublo, subhi, atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, domain->boxlo, - domain->prd, &fieldp_pinned); + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success,aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1017,37 +1018,37 @@ void PairAmoebaGPU::polar_real() double felec = 0.5 * electric / am_dielectric; firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, domain->boxlo, - domain->prd, &tep_pinned); + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, 
cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // reference to the tep array from GPU lib - printf("compute polar real\n"); - if (tep_single) { - float *tep_ptr = (float *)tep_pinned; - compute_force_from_tep(tep_ptr, fpolar, virpolar); + + if (tq_single) { + float *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); } else { - double *tep_ptr = (double *)tep_pinned; - compute_force_from_tep(tep_ptr, fpolar, virpolar); + double *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); } } /* ---------------------------------------------------------------------- - init specific to this pair style + compute atom forces from torques ------------------------------------------------------------------------- */ template -void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, - double** force_comp, - double* virial_comp) +void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) { int i,ix,iy,iz; double xix,yix,zix; @@ -1055,19 +1056,16 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, double xiz,yiz,ziz; double vxx,vyy,vzz; double vxy,vxz,vyz; - double fix[3],fiy[3],fiz[3],tep[4]; + double fix[3],fiy[3],fiz[3],_tq[4]; double** x = atom->x; int nlocal = atom->nlocal; for (i = 0; i < nlocal; i++) { - tep[0] = tep_ptr[4*i]; - tep[1] = tep_ptr[4*i+1]; - tep[2] = tep_ptr[4*i+2]; - - if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); - torque2force(i,tep,fix,fiy,fiz,force_comp); - if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); iz = zaxis2local[i]; ix = xaxis2local[i]; @@ -1092,7 +1090,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); + virial_comp[0] += vxx; virial_comp[1] += vyy; virial_comp[2] += vzz; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index a913449a62..d9a3fc5904 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -43,9 +43,9 @@ class PairAmoebaGPU : public PairAmoeba { private: int gpu_mode; double cpu_time; - void *tep_pinned; + void *tq_pinned; void *fieldp_pinned; - bool tep_single; + bool tq_single; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; @@ -55,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba { void udirect2b_cpu(); template - void compute_force_from_tep(const numtyp*, double**, double*); + void compute_force_from_torque(const numtyp*, double**, double*); }; } // namespace LAMMPS_NS From 5d801e985fd78e631c928cb15aaa85be1529ab98 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 23:24:23 -0500 Subject: [PATCH 039/181] More cleanup --- lib/gpu/lal_amoeba.cpp | 14 ++-- lib/gpu/lal_base_amoeba.cpp | 125 ++++++++++++++++++++---------------- lib/gpu/lal_base_amoeba.h | 11 ++-- 3 files changed, 82 insertions(+), 68 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index af71decb86..d2f2b1bf79 100644 --- a/lib/gpu/lal_amoeba.cpp +++ 
b/lib/gpu/lal_amoeba.cpp @@ -182,13 +182,13 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list if not done yet - if (!this->short_nbor_avail) { + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; + this->short_nbor_polar_avail = true; } this->k_udirect2b.set_size(GX,BX); @@ -222,13 +222,13 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list if not done yet - if (!this->short_nbor_avail) { + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; + this->short_nbor_polar_avail = true; } this->k_umutual2b.set_size(GX,BX); @@ -261,13 +261,13 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list if not done yet - if (!this->short_nbor_avail) { + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; + this->short_nbor_polar_avail = true; } this->k_polar.set_size(GX,BX); @@ -283,7 +283,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { // Signal that short nbor list is not avail for the next time step // do it here because polar_real() is the last kernel in a time step at this point - this->short_nbor_avail = false; + this->short_nbor_polar_avail = false; return GX; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index f70903c889..e777981912 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -21,7 +21,7 @@ namespace LAMMPS_AL { extern Device global_device; template -BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) { +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -241,11 +241,12 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, } // --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. 
+// Copy nbor list from host if necessary and then calculate forces, virials +// for the polar real-space term // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, +void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, int *ilist, int *numj, int **firstneigh, @@ -432,17 +433,20 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2_mpole, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { +int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_mpole, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -492,7 +496,8 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co _aewald = aewald; const int red_blocks=multipole_real(eflag,vflag); - // leave the answers (forces, energies and virial) on the device, only copy them back in the last kernel (polar_real) + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); //device->add_ans_object(ans); @@ -516,18 +521,21 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co // of the permanent field // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { +int** BaseAmoebaT::compute_udirect2b(const 
int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -587,18 +595,21 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // of the induced field // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { +int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -657,19 +668,21 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, - const double off2_polar, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { +int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const 
double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_polar, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -719,7 +732,8 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const _aewald = aewald; const int red_blocks=polar_real(eflag,vflag); - // only copy answers (forces, energies and virial) back from the device in the last kernel (which is polar_real here) + // only copy answers (forces, energies and virial) back from the device + // in the last kernel (which is polar_real here) ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); @@ -746,8 +760,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const { template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, - double** uind, double** uinp) { - + double** uind, double** uinp) { // signal that we need to transfer extra data from the host atom->extra_data_unavail(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 0b6c09742e..a45316b6f3 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -192,8 +192,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double felec, const double off2_polar, double *charge, - double *boxlo, double *prd, void **tep_ptr); + const double aewald, const double felec, const double off2_polar, + double *charge, double *boxlo, double *prd, void **tep_ptr); /// Compute polar real-space with host neighboring (not active for now) void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, @@ -202,8 +202,9 @@ class BaseAmoeba { double **host_uinp, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, const double aewald, const double felec, const double off2_polar, - double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); + const double cpu_time, bool &success, const double aewald, const double felec, + const double off2_polar, double *charge, const int nlocal, double *boxlo, + double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -271,7 +272,7 @@ class BaseAmoeba { int _extra_fields; double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors; double _gpu_overhead, _driver_overhead; - bool short_nbor_avail; + bool short_nbor_polar_avail; UCL_D_Vec *_nbor_data; numtyp _aewald,_felec; From 1166845fcf025292ac37646ed37e4b62d3bcc85b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 18 Sep 2021 10:22:22 -0500 Subject: [PATCH 040/181] Prepared data structure for the dispersion real-space term --- lib/gpu/lal_amoeba.cpp | 38 ++++++--- lib/gpu/lal_amoeba.cu | 166 ++++++++++++++++++++++++++++++------ lib/gpu/lal_amoeba.h | 14 +-- lib/gpu/lal_amoeba_ext.cpp | 24 +++--- src/GPU/pair_amoeba_gpu.cpp | 12 +-- 5 files changed, 197 insertions(+), 57 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index d2f2b1bf79..28ed02b480 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -44,12 +44,14 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const { } template -int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double 
*host_thole, const double *host_dirdamp, +int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -80,11 +82,22 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda host_write[i].x = host_pdamp[i]; host_write[i].y = host_thole[i]; host_write[i].z = host_dirdamp[i]; - host_write[i].w = (numtyp)0; + host_write[i].w = host_amtype2class[i]; } - damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); - ucl_copy(damping,host_write,false); + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = (numtyp)0; + host_write2[i].w = (numtyp)0; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); @@ -100,9 +113,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda _polar_uscale = polar_uscale; _allocated=true; - this->_max_bytes=damping.row_bytes() - + sp_polar.row_bytes() - + this->_tep.row_bytes(); + this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + + sp_polar.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -112,7 +124,7 @@ void AmoebaT::clear() { return; _allocated=false; - damping.clear(); + coeff_amtype.clear(); sp_polar.clear(); this->clear_atomic(); @@ -151,7 +163,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, @@ -192,7 +204,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { } this->k_udirect2b.set_size(GX,BX); - this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, @@ -232,7 +244,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { } this->k_umutual2b.set_size(GX,BX); - this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, @@ -271,7 +283,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { } this->k_polar.set_size(GX,BX); - 
this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 41185f30e3..5a1151f610 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -147,7 +147,7 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ @@ -210,8 +210,7 @@ _texture( q_tex,int2); } \ } -// SHUFFLE_AVAIL == 1 -#else +#else // SHUFFLE_AVAIL == 1 #define local_allocate_store_ufld() @@ -280,7 +279,7 @@ _texture( q_tex,int2); #if (EVFLAG == 1) -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -376,7 +375,7 @@ _texture( q_tex,int2); // EVFLAG == 0 #else -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -394,6 +393,125 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + dispersion = real-space portion of Ewald dispersion + adapted from Tinker edreal1d() routine +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp felec, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int l=0; l<6; l++) virial[l]=(acctyp)0; + } + + acctyp4 tq; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; + + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + if (iioff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp 
qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + } // nbor + + } // ii { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_dirdamp, - const double *host_special_mpole, + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -60,8 +61,11 @@ class Amoeba : public BaseAmoeba { // --------------------------- TYPE DATA -------------------------- - /// pdamp = damping.x; thole = damping.y - UCL_D_Vec damping; + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 8493e9331d..804bf10f32 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -27,13 +27,14 @@ static Amoeba AMOEBAMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int amoeba_gpu_init(const int ntypes, const int max_amtype, +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, - const double *host_dirdamp, + const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -63,11 +64,13 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, int init_ok=0; if (world_me==0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_mpole, host_special_polar_wscale, + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, polar_dscale, polar_uscale); + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -83,11 +86,12 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, fflush(screen); } if 
(gpu_rank==i && world_me!=0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_mpole, host_special_polar_wscale, + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, polar_dscale, polar_uscale); + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index f932f05e25..25f4718163 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -50,13 +50,14 @@ enum{GORDON1,GORDON2}; // External functions from cuda library for atom decomposition -int amoeba_gpu_init(const int ntypes, const int max_amtype, +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, - const double *host_dirdamp, + const double *host_dirdamp, const int* host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -168,9 +169,10 @@ void PairAmoebaGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; - int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, - special_mpole, special_polar_wscale, special_polar_piscale, - special_polar_pscale, atom->nlocal, + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + pdamp, thole, dirdamp, amtype2class, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, csix, adisp, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, polar_dscale, polar_uscale, tq_size); From 0228867d8e547feb35a28190285327df8081ccec Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 19 Sep 2021 23:40:43 -0500 Subject: [PATCH 041/181] Added the dispersion real space kernel and transfer special coeffs to the device --- lib/gpu/lal_amoeba.cpp | 16 +++- lib/gpu/lal_amoeba.cu | 156 +++++++++++++++++++++++++----------- lib/gpu/lal_amoeba.h | 8 ++ lib/gpu/lal_amoeba_ext.cpp | 14 +++- src/GPU/pair_amoeba_gpu.cpp | 9 ++- src/GPU/pair_amoeba_gpu.h | 3 + 6 files changed, 153 insertions(+), 53 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 28ed02b480..1d62e483d8 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -47,6 +47,9 @@ template int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, @@ -109,12 +112,21 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, } ucl_copy(sp_polar,dview,5,false); + 
sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_hal[i]; + dview[i].y=host_special_repel[i]; + dview[i].z=host_special_disp[i]; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_nonpolar,dview,5,false); + _polar_dscale = polar_dscale; _polar_uscale = polar_uscale; _allocated=true; this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() - + sp_polar.row_bytes() + this->_tep.row_bytes(); + + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -125,7 +137,9 @@ void AmoebaT::clear() { _allocated=false; coeff_amtype.clear(); + coeff_amclass.clear(); sp_polar.clear(); + sp_nonpolar.clear(); this->clear_atomic(); } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 5a1151f610..8915ef0146 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -400,8 +400,9 @@ _texture( q_tex,int2); __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_polar, + const __global numtyp4 *restrict coeff_amtype, + const __global numtyp4 *restrict coeff_amclass, + const __global numtyp4 *restrict sp_disp, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -428,20 +429,11 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, for (int l=0; l<6; l++) virial[l]=(acctyp)0; } - acctyp4 tq; - tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; - - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); if (iioff2) continue; - numtyp r = ucl_sqrt(r2); - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; + int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; + numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; + numtyp ak = coeff_amclass[jclass].y; // adisp[jclass]; + numtyp r6 = r2*r2*r2; + numtyp ralpha2 = r2 * aewald*aewald; + numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2; + numtyp expterm = ucl_exp(-ralpha2); + numtyp expa = expterm * term; + + // find the damping factor for the dispersion interaction + + numtyp r = ucl_sqrt(r2); + numtyp r7 = r6 * r; + numtyp di = ai * r; + numtyp di2 = di * di; + numtyp di3 = di * di2; + numtyp dk = ak * r; + numtyp expi = ucl_exp(-di); + numtyp expk = ucl_exp(-dk); + + numtyp ai2,ak2; + numtyp di4,di5; + numtyp dk2,dk3; + numtyp ti,ti2; + numtyp tk,tk2; + numtyp damp3,damp5; + numtyp ddamp; + numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)]; + + if (ai != ak) { + ai2 = ai * ai; + ak2 = ak * ak; + dk2 = dk * dk; + dk3 = dk * dk2; + ti = ak2 / (ak2-ai2); + ti2 = ti * ti; + tk = ai2 / (ai2-ak2); + tk2 = tk * tk; + damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi + - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk + - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi + - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk; + damp5 = (numtyp)1.0 - 
ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi + - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk + - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi + - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; + ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + + (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0); + + } else { + di4 = di2 * di2; + di5 = di2 * di3; + damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi; + damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi; + ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0; + } + + numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; + + // apply damping and scaling factors for this interaction + + numtyp scale = factor_disp * damp*damp; + scale = scale - (numtyp )1.0; + numtyp e = -ci * ck * (expa+scale) / r6; + numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r; + numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7; + + energy+= e; + + // increment the damped dispersion derivative components + + numtyp dedx = de * xr; + numtyp dedy = de * yr; + numtyp dedz = de * zr; + f.x += dedx; + f.y += dedy; + f.z += dedz; + + // increment the internal virial tensor components + + numtyp vxx = xr * dedx; + numtyp vyx = yr * dedx; + numtyp vzx = zr * dedx; + numtyp vyy = yr * dedy; + numtyp vzy = zr * dedy; + numtyp vzz = zr * dedz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vyx; + virial[4] += vzx; + virial[5] += vzy; } // nbor } // ii { int init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -70,7 +72,13 @@ class Amoeba : public BaseAmoeba { /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, /// sp_polar.z = special_polar_piscale + /// sp_polar.w = special_mpole UCL_D_Vec sp_polar; + /// Special nonpolar values [0-4]: + /// sp_nonpolar.x = special_hal + /// sp_nonpolar.y special_repel + /// sp_nonpolar.z = special_disp + UCL_D_Vec sp_nonpolar; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 804bf10f32..86cf6f4c54 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -30,6 +30,9 @@ static Amoeba AMOEBAMF; int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, @@ -66,7 +69,9 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas if (world_me==0) init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_mpole, 
host_special_polar_wscale, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, host_csix, host_adisp, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, @@ -86,8 +91,11 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_mpole, host_special_polar_wscale, + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, host_csix, host_adisp, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 25f4718163..35bba58a14 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -53,7 +53,8 @@ enum{GORDON1,GORDON2}; int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int* host_amtype2class, - const double *host_special_mpole, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -116,6 +117,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tq_pinned = nullptr; + gpu_hal_ready = false; + gpu_repulsion_ready = false; + gpu_dispersion_real_ready = false; gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -170,7 +174,8 @@ void PairAmoebaGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, - pdamp, thole, dirdamp, amtype2class, special_mpole, + pdamp, thole, dirdamp, amtype2class, special_hal, + special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, csix, adisp, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index d9a3fc5904..710f997e4c 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -47,6 +47,9 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tq_single; + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; bool gpu_umutual2b_ready; From 4e88cd158ee21fc4fcdfc85d66073ea5b220f6bc Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 20 Sep 2021 11:38:50 -0500 Subject: [PATCH 042/181] Fixed bugs with _tep and _fieldp to allow mixed-precision builds, being defensive with acctyp for these variables --- lib/gpu/lal_amoeba.cu | 140 ++++++++++++++++++------------------- lib/gpu/lal_amoeba_ext.cpp | 2 +- lib/gpu/lal_base_amoeba.h | 2 +- 3 files changed, 72 insertions(+), 72 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 8915ef0146..3c5b949c72 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -102,7 +102,7 @@ 
_texture( q_tex,int2); dufld[5]=red_acc[5][tid]; \ } \ if (offset==0 && ii> SBBITS & 3; - int j = sj & NEIGHMASK; - tagint jtag = tag[j]; - - if (!which) { - int offset=ii; - for (int k=0; k> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; kgpu_rank(); int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); - tep_size=sizeof(PRECISION); + tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION); AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a45316b6f3..fea1728e8c 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -235,7 +235,7 @@ class BaseAmoeba { double** uind, double** uinp); /// Per-atom arrays - UCL_Vector _tep, _fieldp; + UCL_Vector _tep, _fieldp; int _nmax, _max_tep_size, _max_fieldp_size; // ------------------------ FORCE/ENERGY DATA ----------------------- From 42034bd1c9809a50b04dad411744039e02c67842 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 20 Sep 2021 12:48:29 -0500 Subject: [PATCH 043/181] Fixed bugs for undefined tagint and ucl_powr ambiguity in kernels for OpenCL builds --- lib/gpu/lal_amoeba.cu | 13 ++++++++++++- lib/gpu/lal_base_amoeba.cpp | 3 --- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 3c5b949c72..e44f302563 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -37,7 +37,18 @@ _texture( q_tex,int2); #else #define pos_tex x_ #define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int #endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + #if (SHUFFLE_AVAIL == 0) @@ -1042,7 +1053,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } } else { pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,3.0); + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); if (damp < (numtyp)50.0) { numtyp expdamp = ucl_exp(-damp); scale3 = (numtyp)1.0 - expdamp; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e777981912..a5552f6f3b 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -27,9 +27,6 @@ BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_av nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; - #if defined(LAL_OCL_EV_JIT) - pair_program_noev=nullptr; - #endif } template From a2fd784034f1cf05ff0662b811f53c4f4dfc283f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 21 Sep 2021 10:55:38 -0500 Subject: [PATCH 044/181] Added the dispersion real space term, which is for HIPPO. 
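
The host-side flow mirrors the other real-space terms: PairAmoeba::dispersion_real()
becomes virtual so PairAmoebaGPU can override it and forward to the new
amoeba_gpu_compute_dispersion_real() wrapper. No torque array comes back from this
term; forces, energy and virial are accumulated on the device and copied back by the
last kernel (polar_real). The sketch below only illustrates the intended call path;
the bookkeeping names (inum, nall, sublo/subhi, host_start, ilist, numneigh,
firstneigh, success) are assumed to be set up as in multipole_real() and are not the
final implementation.

    void PairAmoebaGPU::dispersion_real()
    {
      // aewald and off2 are assumed to already be selected for the
      // dispersion term (the other real-space terms do this via choose())
      int *ilist, *numneigh, **firstneigh;
      int host_start;
      bool success;
      int inum = atom->nlocal;
      int nall = atom->nlocal + atom->nghost;
      double sublo[3], subhi[3];
      for (int i = 0; i < 3; i++) {        // orthogonal box assumed here
        sublo[i] = domain->sublo[i];
        subhi[i] = domain->subhi[i];
      }

      firstneigh = amoeba_gpu_compute_dispersion_real(
        neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup,
        rpole, sublo, subhi, atom->tag, atom->nspecial, atom->special,
        atom->nspecial15, atom->special15, eflag, vflag, eflag_atom,
        vflag_atom, host_start, &ilist, &numneigh, cpu_time, success,
        aewald, off2, atom->q, domain->boxlo, domain->prd);
      if (!success)
        error->one(FLERR,"Insufficient memory on accelerator");
    }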
---
 lib/gpu/lal_amoeba.cpp      | 49 ++++++++++++++++++++---
 lib/gpu/lal_amoeba.cu       | 11 +++---
 lib/gpu/lal_amoeba.h        |  9 +++--
 lib/gpu/lal_amoeba_ext.cpp  | 17 ++++++++
 lib/gpu/lal_base_amoeba.cpp | 79 +++++++++++++++++++++++++++++++++++--
 lib/gpu/lal_base_amoeba.h   | 31 ++++++++++-----
 src/AMOEBA/pair_amoeba.h    |  2 +-
 src/GPU/pair_amoeba_gpu.cpp | 63 ++++++++++++++++++++++++++++-
 src/GPU/pair_amoeba_gpu.h   |  1 +
 9 files changed, 234 insertions(+), 28 deletions(-)

diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 1d62e483d8..a9e02ee7b4 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -62,9 +62,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                             cell_size,gpu_split,_screen,amoeba,
-                            "k_amoeba_multipole", "k_amoeba_udirect2b",
-                            "k_amoeba_umutual2b", "k_amoeba_polar",
-                            "k_amoeba_short_nbor");
+                            "k_amoeba_dispersion", "k_amoeba_multipole",
+                            "k_amoeba_udirect2b", "k_amoeba_umutual2b",
+                            "k_amoeba_polar", "k_amoeba_short_nbor");
   if (success!=0)
     return success;
 
@@ -150,7 +150,48 @@ double AmoebaT::host_memory_usage() const {
 }
 
 // ---------------------------------------------------------------------------
-// Calculate the polar real-space term, returning tep
+// Calculate the dispersion real-space term (forces, energy and virial only)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int AmoebaT::dispersion_real(const int eflag, const int vflag) {
+  int ainum=this->ans->inum();
+  if (ainum == 0)
+    return 0;
+
+  int _nall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+  this->time_pair.start();
+
+  // Build the short neighbor list for the cutoff off2_disp;
+  // at this point dispersion is the first kernel in a time step
+
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                         &this->_nbor_data->begin(),
+                         &this->dev_short_nbor, &this->_off2_disp, &ainum,
+                         &nbor_pitch, &this->_threads_per_atom);
+  printf("launching dispersion\n");
+  this->k_dispersion.set_size(GX,BX);
+  this->k_dispersion.run(&this->atom->x, &this->atom->extra,
+                         &coeff_amtype, &coeff_amclass, &sp_nonpolar,
+                         &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                         &this->dev_short_nbor,
+                         &this->ans->force, &this->ans->engv,
+                         &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                         &this->_threads_per_atom, &this->_aewald,
+                         &this->_off2_disp);
+  this->time_pair.stop();
+
+  return GX;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate the multipole real-space term, returning tep
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int AmoebaT::multipole_real(const int eflag, const int vflag) {
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index e44f302563..60205b16ff 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -413,7 +413,7 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict extra,
                                   const __global numtyp4 *restrict coeff_amtype,
                                   const __global numtyp4 *restrict coeff_amclass,
-                                  const __global numtyp4 *restrict sp_disp,
+                                  const __global numtyp4 *restrict sp_nonpolar,
                                   const __global int *dev_nbor,
                                   const __global int *dev_packed,
                                   const __global int
*dev_short_nbor, @@ -422,8 +422,7 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, const int t_per_atom, const numtyp aewald, - const numtyp felec, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) + const numtyp off2) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -876,9 +875,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // accumulate tq store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep); - // accumate force, energy and virial + // accumate force, energy and virial: use _acc if not the first kernel store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv); + //store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, + // offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } /* ---------------------------------------------------------------------- @@ -1785,7 +1786,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // accumate force, energy and virial //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, -// offset,eflag,vflag,ans,engv); + // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index 39d65375cb..df556a1018 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -38,9 +38,11 @@ class Amoeba : public BaseAmoeba { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, const int max_amtype, const int max_amclass, - const double *host_pdamp, const double *host_thole, const double *host_dirdamp, - const int *host_amtype2class, const double *host_special_mpole, - const double *host_special_hal, const double *host_special_repel, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_hal, + const double *host_special_repel, const double *host_special_disp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, @@ -91,6 +93,7 @@ class Amoeba : public BaseAmoeba { protected: bool _allocated; + int dispersion_real(const int eflag, const int vflag); int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 55c08adf82..309830e1ce 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -117,6 +117,23 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } +int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd) { + return AMOEBAMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, 
special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd); +} + int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a5552f6f3b..f252131ea7 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -33,6 +33,7 @@ template BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; + k_dispersion.clear(); k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); @@ -54,6 +55,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, + const char *k_name_dispersion, const char *k_name_multipole, const char *k_name_udirect2b, const char *k_name_umutual2b, @@ -90,8 +92,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_multipole,k_name_udirect2b, - k_name_umutual2b,k_name_polar,k_name_short_nbor); + compile_kernels(*ucl_device,pair_program,k_name_dispersion,k_name_multipole, + k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -427,7 +429,74 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute polar real-space +// Reneighbor on GPU if necessary, and then compute dispersion real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_disp, + double *host_q, double *boxlo, double *prd) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
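+  // uind and uinp are passed as nullptr below: the real-space dispersion
+  // kernel only needs positions, types and the amtype/amclass coefficients,
+  // not the induced dipoles.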
+ + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + _off2_disp = off2_disp; + _aewald = aewald; + const int red_blocks=dispersion_real(eflag,vflag); + + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + hd_balancer.stop_timer(); + + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- template int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, @@ -816,6 +885,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname_dispersion, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, @@ -828,7 +898,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, pair_program=new UCL_Program(dev); std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - + + k_dispersion.set_function(*pair_program,kname_dispersion); k_multipole.set_function(*pair_program,kname_multipole); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index fea1728e8c..fcff3186c7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,9 +54,9 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_multipole, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor); + const char *kname_dispersion, const char *kname_multipole, + const char *kname_udirect2b, const char *kname_umutual2b, + const char *kname_polar, const char *kname_short_nbor); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -142,6 +142,18 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); + /// Compute dispersion real-space with device neighboring + int** compute_dispersion_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double off2_disp, double *charge, + double *boxlo, double *prd); + /// Compute multipole real-space with device neighboring int** 
compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -257,8 +269,8 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_special15; - UCL_Kernel k_short_nbor; + UCL_Kernel k_dispersion, k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -276,13 +288,14 @@ class BaseAmoeba { UCL_D_Vec *_nbor_data; numtyp _aewald,_felec; - numtyp _off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; + numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_multipole, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor); + const char *kname_dispersion, const char *kname_multipole, + const char *kname_udirect2b, const char *kname_umutual2b, + const char *kname_polar, const char *kname_short_nbor); + virtual int dispersion_real(const int eflag, const int vflag) = 0; virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 72c142888e..8a2f09d443 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -348,7 +348,7 @@ class PairAmoeba : public Pair { int, double, double, double *); void dispersion(); - void dispersion_real(); + virtual void dispersion_real(); void dispersion_kspace(); void multipole(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 35bba58a14..4894ac6203 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -65,6 +65,17 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const double polar_dscale, const double polar_uscale, int& tq_size); void amoeba_gpu_clear(); +int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd); + int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, @@ -118,8 +129,8 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tq_pinned = nullptr; gpu_hal_ready = false; - gpu_repulsion_ready = false; - gpu_dispersion_real_ready = false; + gpu_repulsion_ready = false; // true for HIPPO + gpu_dispersion_real_ready = false; // true for HIPPO gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -194,6 +205,54 @@ void PairAmoebaGPU::init_style() /* ---------------------------------------------------------------------- */ +void 
PairAmoebaGPU::dispersion_real() +{ + if (!gpu_dispersion_real_ready) { + PairAmoeba::dispersion_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_dewald) choose(DISP_LONG); + else choose(DISP); + + firstneigh = amoeba_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); +} + +/* ---------------------------------------------------------------------- */ + void PairAmoebaGPU::multipole_real() { if (!gpu_multipole_real_ready) { diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 710f997e4c..de17703dc7 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -35,6 +35,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void induce(); + virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); From d77d5b7f0a1db4b4cc2eec14c1b2ecd9ba49936b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 21 Sep 2021 15:40:06 -0500 Subject: [PATCH 045/181] Added classes for hippo/gpu, refactored BaseAmoeba and made room for the dispersion real-space term in hippo --- lib/gpu/lal_amoeba.cpp | 43 +- lib/gpu/lal_amoeba.h | 1 - lib/gpu/lal_amoeba_ext.cpp | 4 +- lib/gpu/lal_base_amoeba.cpp | 73 +- lib/gpu/lal_base_amoeba.h | 19 +- lib/gpu/lal_hippo.cpp | 430 ++++++++ lib/gpu/lal_hippo.cu | 1892 +++++++++++++++++++++++++++++++++++ lib/gpu/lal_hippo.h | 120 +++ lib/gpu/lal_hippo_ext.cpp | 210 ++++ src/GPU/pair_amoeba_gpu.cpp | 65 +- src/GPU/pair_amoeba_gpu.h | 2 +- src/GPU/pair_hippo_gpu.cpp | 1175 ++++++++++++++++++++++ src/GPU/pair_hippo_gpu.h | 80 ++ 13 files changed, 3918 insertions(+), 196 deletions(-) create mode 100644 lib/gpu/lal_hippo.cpp create mode 100644 lib/gpu/lal_hippo.cu create mode 100644 lib/gpu/lal_hippo.h create mode 100644 lib/gpu/lal_hippo_ext.cpp create mode 100644 src/GPU/pair_hippo_gpu.cpp create mode 100644 src/GPU/pair_hippo_gpu.h diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index a9e02ee7b4..8d9af4706e 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -62,7 +62,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_dispersion", "k_amoeba_multipole", + "k_amoeba_multipole", "k_amoeba_udirect2b", "k_amoeba_umutual2b", "k_amoeba_polar", "k_amoeba_short_nbor"); if (success!=0) @@ -149,47 +149,6 @@ double AmoebaT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Amoeba); } -// 
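// ---------------------------------------------------------------------------
// Illustration only (not part of the patch): each per-term override in
// PairAmoebaGPU follows the dispatch pattern shown above -- fall back to the
// CPU implementation in the base class unless the matching GPU kernel has
// been flagged as ready (gpu_dispersion_real_ready stays false for AMOEBA
// because only HIPPO has a real-space dispersion term).  A compilable sketch
// with stand-in class names, not the LAMMPS classes themselves:
// ---------------------------------------------------------------------------
#include <cstdio>

struct PairBaseSketch {                      // stand-in for the CPU pair style
  virtual ~PairBaseSketch() = default;
  virtual void dispersion_real() { std::puts("CPU (base class) dispersion"); }
};

struct PairGpuSketch : PairBaseSketch {      // stand-in for the GPU pair style
  bool gpu_dispersion_real_ready = false;    // flipped once the kernel is available
  void dispersion_real() override {
    if (!gpu_dispersion_real_ready) {        // term not ported: reuse the CPU path
      PairBaseSketch::dispersion_real();
      return;
    }
    // here the real code gathers sublo/subhi (via domain->bbox for triclinic
    // boxes), selects the DISP or DISP_LONG cutoff, launches the kernel and
    // errors out if the accelerator ran out of memory
    std::puts("GPU dispersion kernel");
  }
};

int main() {
  PairGpuSketch p;
  p.dispersion_real();                       // CPU fallback
  p.gpu_dispersion_real_ready = true;
  p.dispersion_real();                       // GPU path
  return 0;
}
// ---------------------------------------------------------------------------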
--------------------------------------------------------------------------- -// Calculate the dispersion real-space term, returning tep -// --------------------------------------------------------------------------- -template -int AmoebaT::dispersion_real(const int eflag, const int vflag) { - int ainum=this->ans->inum(); - if (ainum == 0) - return 0; - - int _nall=this->atom->nall(); - int nbor_pitch=this->nbor->nbor_pitch(); - - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - this->time_pair.start(); - - // Build the short neighbor list for the cutoff off2_mpole, - // at this point mpole is the first kernel in a time step - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &this->_off2_disp, &ainum, - &nbor_pitch, &this->_threads_per_atom); - printf("launching dispersion\n"); - this->k_dispersion.set_size(GX,BX); - this->k_dispersion.run(&this->atom->x, &this->atom->extra, - &coeff_amtype, &coeff_amclass, &sp_nonpolar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, - &this->_off2_disp); - this->time_pair.stop(); - - return GX; -} - // --------------------------------------------------------------------------- // Calculate the multipole real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index df556a1018..04eb6e4aa9 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -93,7 +93,6 @@ class Amoeba : public BaseAmoeba { protected: bool _allocated; - int dispersion_real(const int eflag, const int vflag); int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 309830e1ce..565f16b627 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -116,7 +116,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas void amoeba_gpu_clear() { AMOEBAMF.clear(); } - +/* int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -133,7 +133,7 @@ int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd); } - +*/ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index f252131ea7..b8e927d6ce 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -33,7 +33,6 @@ template BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; - k_dispersion.clear(); k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); @@ -55,7 +54,6 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, 
- const char *k_name_dispersion, const char *k_name_multipole, const char *k_name_udirect2b, const char *k_name_umutual2b, @@ -92,7 +90,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_dispersion,k_name_multipole, + compile_kernels(*ucl_device,pair_program,k_name_multipole, k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { @@ -428,73 +426,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute dispersion real-space -// --------------------------------------------------------------------------- -template -int** BaseAmoebaT::compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_disp, - double *host_q, double *boxlo, double *prd) { - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); - - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. 
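// ---------------------------------------------------------------------------
// Illustration only (not part of the patch): this hunk moves the dispersion
// plumbing out of the shared BaseAmoeba class, since AMOEBA has no real-space
// dispersion term; the Hippo class registers "k_hippo_dispersion" itself after
// the common init_atomic() step.  A skeleton of that ownership split, using
// simplified stand-in names (KernelHandle instead of UCL_Kernel, *Sketch
// instead of the real class names):
// ---------------------------------------------------------------------------
#include <string>

struct KernelHandle { std::string name; };           // stand-in for UCL_Kernel

struct BaseAmoebaSketch {
  // kernels every AMOEBA-family pair style shares
  KernelHandle k_multipole{"k_*_multipole"}, k_polar{"k_*_polar"};
  virtual ~BaseAmoebaSketch() = default;
  virtual int multipole_real(int eflag, int vflag) = 0;
  virtual int polar_real(int eflag, int vflag) = 0;
  // note: no dispersion hook here any more
};

struct HippoSketch : BaseAmoebaSketch {
  KernelHandle k_dispersion{"k_hippo_dispersion"};   // HIPPO-only kernel
  int dispersion_real(int, int) { return 1; }        // would launch k_dispersion
  int multipole_real(int, int) override { return 1; }
  int polar_real(int, int) override { return 1; }
};
// ---------------------------------------------------------------------------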
- - int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); - - _off2_disp = off2_disp; - _aewald = aewald; - const int red_blocks=dispersion_real(eflag,vflag); - - // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (polar_real) - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); - - hd_balancer.stop_timer(); - - return firstneigh; // nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- @@ -885,7 +816,6 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname_dispersion, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, @@ -899,7 +829,6 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - k_dispersion.set_function(*pair_program,kname_dispersion); k_multipole.set_function(*pair_program,kname_multipole); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index fcff3186c7..40da00f176 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,7 +54,7 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_dispersion, const char *kname_multipole, + const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, const char *kname_short_nbor); @@ -142,18 +142,6 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - /// Compute dispersion real-space with device neighboring - int** compute_dispersion_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_disp, double *charge, - double *boxlo, double *prd); - /// Compute multipole real-space with device neighboring int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -269,7 +257,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_dispersion, k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_multipole, k_udirect2b, 
k_umutual2b, k_polar; UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -291,11 +279,10 @@ class BaseAmoeba { numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_dispersion, const char *kname_multipole, + const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, const char *kname_short_nbor); - virtual int dispersion_real(const int eflag, const int vflag) = 0; virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp new file mode 100644 index 0000000000..7fa358e35a --- /dev/null +++ b/lib/gpu/lal_hippo.cpp @@ -0,0 +1,430 @@ +/*************************************************************************** + hippo.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the hippo pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "hippo_cl.h" +#elif defined(USE_CUDART) +const char *hippo=0; +#else +#include "hippo_cubin.h" +#endif + +#include "lal_hippo.h" +#include +namespace LAMMPS_AL { +#define HippoT Hippo + +extern Device device; + +template +HippoT::Hippo() : BaseAmoeba(), + _allocated(false) { +} + +template +HippoT::~Hippo() { + clear(); + k_dispersion.clear(); +} + +template +int HippoT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,hippo, + "k_hippo_multipole", + "k_hippo_udirect2b", "k_hippo_umutual2b", + "k_hippo_polar", "k_hippo_short_nbor"); + if (success!=0) + return success; + + k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, 
*(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = host_dirdamp[i]; + host_write[i].w = host_amtype2class[i]; + } + + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = (numtyp)0; + host_write2[i].w = (numtyp)0; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_polar_wscale[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=host_special_mpole[i]; + } + ucl_copy(sp_polar,dview,5,false); + + sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_hal[i]; + dview[i].y=host_special_repel[i]; + dview[i].z=host_special_disp[i]; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_nonpolar,dview,5,false); + + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); + return 0; +} + +template +void HippoT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff_amtype.clear(); + coeff_amclass.clear(); + sp_polar.clear(); + sp_nonpolar.clear(); + + this->clear_atomic(); +} + +template +double HippoT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Hippo); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute dispersion real-space +// --------------------------------------------------------------------------- + +template +int** HippoT::compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_disp, + double *host_q, double *boxlo, double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. 
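// ---------------------------------------------------------------------------
// Illustration only (not part of the patch): the Hippo::init() code a few
// lines above interleaves the per-type parameters (pdamp, thole, dirdamp and
// the type-to-class map) into one 4-component element per atom type, and the
// per-class dispersion coefficients (csix, adisp) into another, so a kernel
// thread fetches a full parameter set with a single read.  A host-side sketch
// of that packing; float4_t stands in for the templated numtyp4 and
// pack_amtype is a hypothetical helper name.
// ---------------------------------------------------------------------------
#include <vector>

struct float4_t { float x, y, z, w; };   // stand-in for numtyp4

std::vector<float4_t> pack_amtype(int ntypes, const double *pdamp,
                                  const double *thole, const double *dirdamp,
                                  const int *amtype2class) {
  std::vector<float4_t> out(ntypes);
  for (int i = 0; i < ntypes; ++i) {
    out[i].x = static_cast<float>(pdamp[i]);
    out[i].y = static_cast<float>(thole[i]);
    out[i].z = static_cast<float>(dirdamp[i]);
    out[i].w = static_cast<float>(amtype2class[i]);  // stored as float, read back as int in the kernel
  }
  return out;  // staging buffer; the patch then ucl_copy()s it into coeff_amtype
}
// ---------------------------------------------------------------------------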
+ // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. + + int** firstneigh = nullptr; + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + this->_off2_disp = off2_disp; + this->_aewald = aewald; + const int red_blocks=dispersion_real(eflag,vflag); + + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + this->hd_balancer.stop_timer(); + + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Calculate the dispersion real-space term, returning tep +// --------------------------------------------------------------------------- +template +int HippoT::dispersion_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_disp, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_dispersion.set_size(GX,BX); + k_dispersion.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the multipole real-space term, returning tep +// --------------------------------------------------------------------------- +template +int HippoT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the real-space permanent field, returning field and fieldp +// --------------------------------------------------------------------------- +template +int HippoT::udirect2b(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_udirect2b.set_size(GX,BX); + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the real-space induced field, returning field and fieldp +// --------------------------------------------------------------------------- +template +int HippoT::umutual2b(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the polar real-space term, returning tep +// --------------------------------------------------------------------------- +template +int HippoT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int 
nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_polar_avail = false; + + return GX; +} + +template class Hippo; +} diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu new file mode 100644 index 0000000000..a21afe6cd8 --- /dev/null +++ b/lib/gpu/lal_hippo.cu @@ -0,0 +1,1892 @@ +// ************************************************************************** +// hippo.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the hippo pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include +#include "lal_aux_fun1.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } 
\ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if 
(eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && iioff2) continue; + + int jtype = polar3[j].z; // amtype[j]; + int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; + numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; + numtyp ak = coeff_amclass[jclass].y; // adisp[jclass]; + + numtyp r6 = r2*r2*r2; + numtyp ralpha2 = r2 * aewald*aewald; + numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2; + numtyp expterm = ucl_exp(-ralpha2); + numtyp expa = expterm * term; + + // find the damping factor for the dispersion interaction + + numtyp r = ucl_sqrt(r2); + numtyp r7 = r6 * r; + numtyp di = ai * r; + numtyp di2 = di * di; + numtyp di3 = di * di2; + numtyp dk = ak * r; + numtyp expi = ucl_exp(-di); + numtyp expk = ucl_exp(-dk); + + numtyp ai2,ak2; + numtyp di4,di5; + numtyp dk2,dk3; + numtyp ti,ti2; + numtyp tk,tk2; + numtyp damp3,damp5; + numtyp ddamp; + numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)]; + + if (ai != ak) { + ai2 = ai * ai; + ak2 = ak * ak; + dk2 = dk * dk; + dk3 = dk * dk2; + ti = ak2 / (ak2-ai2); + ti2 = ti * ti; + tk = ai2 / (ai2-ak2); + tk2 = tk * tk; + damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi + - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk + - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi + - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk; + damp5 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi + - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk + - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi + - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; + ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + + (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0); + + } else { + di4 = di2 * di2; + di5 = di2 * di3; + damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi; + damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi; + ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0; + } + + numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; + + // apply damping and scaling factors for this interaction + + numtyp scale = factor_disp * damp*damp; + scale = scale - (numtyp )1.0; + numtyp e = -ci * ck * (expa+scale) / r6; + numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r; + numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7; + + energy+= e; + + // increment the damped dispersion derivative components + + numtyp dedx = de * xr; + numtyp dedy = de * yr; + numtyp dedz = de * zr; + f.x += dedx; + f.y += dedy; + f.z += dedz; + + // 
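// ---------------------------------------------------------------------------
// Reference sketch only (not part of the patch): the pair energy accumulated
// above combines the real-space Ewald factor expa with the damping factor
// built from the adisp parameters,
//   e = -csix_i * csix_k * (expa + factor_disp*damp^2 - 1) / r^6.
// The double-precision helper below (a hypothetical checking aid, not device
// code) restates the ai == ak branch of that expression; factor_disp, which
// the kernel hard-wires to 1.0, is kept as a parameter here.
// ---------------------------------------------------------------------------
#include <cmath>

double hippo_disp_pair_energy(double r2, double ci, double ck, double ai,
                              double aewald, double factor_disp) {
  const double r6 = r2 * r2 * r2;
  const double ralpha2 = r2 * aewald * aewald;
  const double expa = std::exp(-ralpha2) * (1.0 + ralpha2 + 0.5 * ralpha2 * ralpha2);

  const double r = std::sqrt(r2);
  const double di = ai * r, di2 = di * di, di3 = di * di2;
  const double di4 = di2 * di2, di5 = di2 * di3;
  const double expi = std::exp(-di);

  // damping when both sites share the same damping parameter (ai == ak)
  const double damp3 = 1.0 - (1.0 + di + 0.5 * di2 + 7.0 * di3 / 48.0 + di4 / 48.0) * expi;
  const double damp5 = 1.0 - (1.0 + di + 0.5 * di2 + di3 / 6.0 + di4 / 24.0 + di5 / 144.0) * expi;
  const double damp = 1.5 * damp5 - 0.5 * damp3;

  const double scale = factor_disp * damp * damp - 1.0;
  return -ci * ck * (expa + scale) / r6;
}
// ---------------------------------------------------------------------------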
increment the internal virial tensor components + + numtyp vxx = xr * dedx; + numtyp vyx = yr * dedx; + numtyp vzx = zr * dedx; + numtyp vyy = yr * dedy; + numtyp vzy = zr * dedy; + numtyp vzz = zr * dedz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vyx; + virial[4] += vzx; + virial[5] += vzy; + } // nbor + + } // iioff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + numtyp dik = dix*dkx + diy*dky + diz*dkz; + numtyp qik = qix*qkx + qiy*qky + qiz*qkz; + numtyp diqk = dix*qkx + diy*qky + diz*qkz; + numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + qixx*qkxx + qiyy*qkyy + qizz*qkzz; + + // additional intermediates involving moments and distance + + numtyp dirx = diy*zr - diz*yr; + numtyp diry = diz*xr - dix*zr; + numtyp dirz = dix*yr - diy*xr; + numtyp dkrx = dky*zr - dkz*yr; + numtyp dkry = dkz*xr - dkx*zr; + numtyp dkrz = dkx*yr - dky*xr; + numtyp dikx = diy*dkz - diz*dky; + numtyp diky = diz*dkx - dix*dkz; + numtyp dikz = dix*dky - diy*dkx; + numtyp qirx = qiz*yr - qiy*zr; + numtyp qiry = qix*zr - qiz*xr; + numtyp qirz = qiy*xr - qix*yr; + numtyp qkrx = qkz*yr - qky*zr; + numtyp qkry = qkx*zr - qkz*xr; + numtyp qkrz = qky*xr - qkx*yr; + numtyp qikx = qky*qiz - qkz*qiy; + numtyp qiky = qkz*qix - qkx*qiz; + numtyp qikz = qkx*qiy - qky*qix; + numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz; + numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz; + numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz; + numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz; + numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz; + numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz; + numtyp qikrx = qizk*yr - qiyk*zr; + numtyp qikry = qixk*zr - qizk*xr; + numtyp qikrz = qiyk*xr - qixk*yr; + numtyp qkirx = qkzi*yr - qkyi*zr; + numtyp qkiry = qkxi*zr - qkzi*xr; + numtyp qkirz = qkyi*xr - qkxi*yr; + numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; + numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; + numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; + numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; + numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; + numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; + numtyp diqkrx = diqkz*yr - diqky*zr; + numtyp diqkry = diqkx*zr - diqkz*xr; + numtyp diqkrz = diqky*xr - diqkx*yr; + numtyp dkqirx = dkqiz*yr - dkqiy*zr; + numtyp dkqiry = dkqix*zr - dkqiz*xr; + numtyp dkqirz = dkqiy*xr - dkqix*yr; + numtyp dqikx = diy*qkz - 
diz*qky + dky*qiz - dkz*qiy - + (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); + + // get reciprocal distance terms for this interaction + + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; + numtyp alsq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m < 6; m++) { + bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 6; m++) bn[m] *= felec; + + term1 = ci*ck; + term2 = ck*dir - ci*dkr + dik; + term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; + term5 = qir*qkr; + numtyp scalek = (numtyp)1.0 - factor_mpole; + rr1 = bn[0] - scalek*rr1; + rr3 = bn[1] - scalek*rr3; + rr5 = bn[2] - scalek*rr5; + rr7 = bn[3] - scalek*rr7; + rr9 = bn[4] - scalek*rr9; + rr11 = bn[5] - scalek*rr11; + numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + + // find standard multipole intermediates for force and torque + + numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + term4*rr9 + term5*rr11; + term1 = -ck*rr3 + dkr*rr5 - qkr*rr7; + term2 = ci*rr3 + dir*rr5 + qir*rr7; + term3 = (numtyp)2.0 * rr5; + term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); + term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9); + term6 = (numtyp)4.0 * rr7; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x += frcx; + f.y += frcy; + f.z += frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii 
(numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_dscale, factor_pscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? 
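// ---------------------------------------------------------------------------
// Reference sketch only (not part of the patch): the bn[] values built in the
// kernels above are the real-space Ewald screening functions B_m(r), obtained
// from B_0(r) = erfc(alpha*r)/r by the recursion
//   B_m(r) = ( (2m-1)*B_{m-1}(r)
//              + (2*alpha^2)^m / (alpha*sqrt(pi)) * exp(-alpha^2*r^2) ) / r^2.
// The device code approximates erfc() with the A1..A5 polynomial; std::erfc
// is used below instead, so this hypothetical helper is a host-side checking
// aid only.
// ---------------------------------------------------------------------------
#include <cmath>
#include <cstddef>

void ewald_bn(double r, double alpha, double *bn, std::size_t nmax) {
  const double pi = 3.14159265358979323846;
  const double r2inv = 1.0 / (r * r);
  const double exp2a = std::exp(-alpha * alpha * r * r);
  bn[0] = std::erfc(alpha * r) / r;
  const double aesq2 = 2.0 * alpha * alpha;
  double aefac = (alpha > 0.0) ? 1.0 / (std::sqrt(pi) * alpha) : 0.0;
  for (std::size_t m = 1; m < nmax; ++m) {
    aefac *= aesq2;   // now (2*alpha^2)^m / (alpha*sqrt(pi))
    bn[m] = ((2.0 * m - 1.0) * bn[m - 1] + aefac * exp2a) * r2inv;
  }
}
// ---------------------------------------------------------------------------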
+ } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_uscale; + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = (numtyp)1.0; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find terms needed later to compute mutual polarization + // if (poltyp != DIRECT) + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + } + + } else { // damp == 0: ??? + } + + numtyp scalek = factor_uscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + + numtyp tdipdip[6]; // the following tdipdip is incorrect!! 
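// ---------------------------------------------------------------------------
// Illustration only (not part of the patch): the dipole-dipole coupling built
// just below is a symmetric 3x3 tensor stored as its six unique components in
// the order {xx, xy, xz, yy, yz, zz}; the mutual field fid/fip is obtained by
// contracting it with the neighbor's induced dipoles (uind/uinp).  A
// standalone restatement of that contraction (apply_symmetric is a
// hypothetical helper name):
// ---------------------------------------------------------------------------
#include <array>

using Vec3 = std::array<double, 3>;

// t = {xx, xy, xz, yy, yz, zz}
Vec3 apply_symmetric(const std::array<double, 6> &t, const Vec3 &u) {
  Vec3 f;
  f[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
  f[1] = t[1] * u[0] + t[3] * u[1] + t[4] * u[2];
  f[2] = t[2] * u[0] + t[4] * u[1] + t[5] * u[2];
  return f;
}
// ---------------------------------------------------------------------------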
needs work to store tdipdip + tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[1] = bcn[1]*xr*yr; + tdipdip[2] = bcn[1]*xr*zr; + tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[4] = bcn[1]*yr*zr; + tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; + //if (i==0 && j == 10) + // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", + // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); + fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; + fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; + fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; + fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; + fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // iioff2) continue; + + numtyp r = ucl_sqrt(r2); + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + numtyp uir = uix*xr + uiy*yr + uiz*zr; + numtyp uirp = uixp*xr + uiyp*yr + uizp*zr; + numtyp ukr = ukx*xr + uky*yr + ukz*zr; + numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr; + + // get reciprocal distance terms for this interaction + + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; + numtyp alsq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) alsq2n = 
(numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply Thole polarization damping to scale factors + + numtyp sc3 = (numtyp)1.0; + numtyp sc5 = (numtyp)1.0; + numtyp sc7 = (numtyp)1.0; + for (k = 0; k < 3; k++) { + rc3[k] = (numtyp)0.0; + rc5[k] = (numtyp)0.0; + rc7[k] = (numtyp)0.0; + } + + // apply Thole polarization damping to scale factors + + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + sc3 = (numtyp)1.0 - expdamp; + sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp; + sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp; + numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv; + numtyp temp5 = damp; + numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp; + rc3[0] = xr * temp3; + rc3[1] = yr * temp3; + rc3[2] = zr * temp3; + rc5[0] = rc3[0] * temp5; + rc5[1] = rc3[1] * temp5; + rc5[2] = rc3[2] * temp5; + rc7[0] = rc5[0] * temp7; + rc7[1] = rc5[1] * temp7; + rc7[2] = rc5[2] * temp7; + } + + psc3 = (numtyp)1.0 - sc3*factor_pscale; + psc5 = (numtyp)1.0 - sc5*factor_pscale; + psc7 = (numtyp)1.0 - sc7*factor_pscale; + dsc3 = (numtyp)1.0 - sc3*factor_dscale; + dsc5 = (numtyp)1.0 - sc5*factor_dscale; + dsc7 = (numtyp)1.0 - sc7*factor_dscale; + usc3 = (numtyp)1.0 - sc3*factor_uscale; + usc5 = (numtyp)1.0 - sc5*factor_uscale; + psr3 = bn[1] - psc3*rr3; + psr5 = bn[2] - psc5*rr5; + psr7 = bn[3] - psc7*rr7; + dsr3 = bn[1] - dsc3*rr3; + dsr5 = bn[2] - dsc5*rr5; + dsr7 = bn[3] - dsc7*rr7; + usr5 = bn[2] - usc5*rr5; + for (k = 0; k < 3; k++) { + prc3[k] = rc3[k] * factor_pscale; + prc5[k] = rc5[k] * factor_pscale; + prc7[k] = rc7[k] * factor_pscale; + drc3[k] = rc3[k] * factor_dscale; + drc5[k] = rc5[k] * factor_dscale; + drc7[k] = rc7[k] * factor_dscale; + urc3[k] = rc3[k] * factor_uscale; + urc5[k] = rc5[k] * factor_uscale; + } + } else { // damp == 0: ??? 
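+        // damp == 0 here means pdi*pdamp[jtype] == 0.  In the CPU
+        // PairAmoeba::polar_real() the psc/dsc/usc and psr/dsr/usr factors are
+        // computed unconditionally (with sc3 = sc5 = sc7 = 1 and rc3, rc5, rc7
+        // already zeroed above), so the same setup presumably still has to run
+        // in this branch.  A minimal sketch under that assumption:
+        //   psc3 = psc5 = psc7 = (numtyp)1.0 - factor_pscale;
+        //   dsc3 = dsc5 = dsc7 = (numtyp)1.0 - factor_dscale;
+        //   usc3 = usc5 = (numtyp)1.0 - factor_uscale;
+        //   psr3 = bn[1] - psc3*rr3; psr5 = bn[2] - psc5*rr5; psr7 = bn[3] - psc7*rr7;
+        //   dsr3 = bn[1] - dsc3*rr3; dsr5 = bn[2] - dsc5*rr5; dsr7 = bn[3] - dsc7*rr7;
+        //   usr5 = bn[2] - usc5*rr5;
+        //   for (k = 0; k < 3; k++)
+        //     prc3[k] = prc5[k] = prc7[k] = drc3[k] = drc5[k] =
+        //     drc7[k] = urc3[k] = urc5[k] = (numtyp)0.0;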
+ } + + // get the induced dipole field used for dipole torques + + numtyp tix3 = psr3*ukx + dsr3*ukxp; + numtyp tiy3 = psr3*uky + dsr3*ukyp; + numtyp tiz3 = psr3*ukz + dsr3*ukzp; + numtyp tuir = -psr5*ukr - dsr5*ukrp; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp); + numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); + numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); + tuir = -psr7*ukr - dsr7*ukrp; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the dEd/dR terms used for direct polarization force + + term1 = bn[2] - dsc3*rr5; + term2 = bn[3] - dsc5*rr7; + term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr - dsr5*xr; + term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; + term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; + numtyp tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr - dsr5*yr; + term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1]; + term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr; + numtyp tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2]; + term4 = rr3*drc3[2] - term1*zr - dsr5*zr; + term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2]; + term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2]; + term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr; + numtyp tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*xr*yr - rr3*yr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0]; + term7 = rr5*drc5[0] - term2*xr; + numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6; + + term3 = term1*xr*zr - rr3*zr*drc3[0]; + term5 = term2*xr*zr - rr5*zr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0]; + numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + 
(numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*yr*zr - rr3*zr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1]; + term7 = rr5*drc5[1] - term2*yr; + numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp; + numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp; + numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp; + + numtyp frcx = depx; + numtyp frcy = depy; + numtyp frcz = depz; + + // get the dEp/dR terms used for direct polarization force + + // tixx and tkxx + term1 = bn[2] - psc3*rr5; + term2 = bn[3] - psc5*rr7; + term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr - psr5*xr; + term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0]; + term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr; + tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + // tiyy and tkyy + term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr - psr5*yr; + term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1]; + term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr; + tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + // tizz and tkzz + term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2]; + term4 = rr3*prc3[2] - term1*zr - psr5*zr; + term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2]; + term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2]; + term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr; + tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tixy and tkxy + term3 = term1*xr*yr - rr3*yr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0]; + term7 = rr5*prc5[0] - term2*xr; + tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6; + + // tixz and tkxz + term3 = term1*xr*zr - rr3*zr*prc3[0]; + term5 = term2*xr*zr - rr5*zr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0]; + tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 + + 
(numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tiyz and tkyz + term3 = term1*yr*zr - rr3*zr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1]; + term7 = rr5*prc5[1] - term2*yr; + tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + // get the dtau/dr terms used for mutual polarization force + // poltyp == MUTUAL && amoeba + + term1 = bn[2] - usc3*rr5; + term2 = bn[3] - usc5*rr7; + term3 = usr5 + term1; + term4 = rr3 * factor_uscale; + term5 = -xr*term3 + rc3[0]*term4; + term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0]; + tixx = uix*term5 + uir*term6; + tkxx = ukx*term5 + ukr*term6; + + term5 = -yr*term3 + rc3[1]*term4; + term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1]; + tiyy = uiy*term5 + uir*term6; + tkyy = uky*term5 + ukr*term6; + + term5 = -zr*term3 + rc3[2]*term4; + term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2]; + tizz = uiz*term5 + uir*term6; + tkzz = ukz*term5 + ukr*term6; + + term4 = -usr5 * yr; + term5 = -xr*term1 + rr3*urc3[0]; + term6 = xr*yr*term2 - rr5*yr*urc5[0]; + tixy = uix*term4 + uiy*term5 + uir*term6; + tkxy = ukx*term4 + uky*term5 + ukr*term6; + + term4 = -usr5 * zr; + term6 = xr*zr*term2 - rr5*zr*urc5[0]; + tixz = uix*term4 + uiz*term5 + uir*term6; + tkxz = ukx*term4 + ukz*term5 + ukr*term6; + + term5 = -yr*term1 + rr3*urc3[1]; + term6 = yr*zr*term2 - rr5*zr*urc5[1]; + tiyz = uiy*term4 + uiz*term5 + uir*term6; + tkyz = uky*term4 + ukz*term5 + ukr*term6; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp + + tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; k +class Hippo : public BaseAmoeba { + public: + Hippo(); + ~Hippo(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const 
double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale); + + /// Compute dispersion real-space with device neighboring + int** compute_dispersion_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double off2_disp, double *charge, + double *boxlo, double *prd); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; + /// Special polar values [0-4]: + /// sp_polar.x = special_polar_wscale + /// sp_polar.y special_polar_pscale, + /// sp_polar.z = special_polar_piscale + /// sp_polar.w = special_mpole + UCL_D_Vec sp_polar; + /// Special nonpolar values [0-4]: + /// sp_nonpolar.x = special_hal + /// sp_nonpolar.y special_repel + /// sp_nonpolar.z = special_disp + UCL_D_Vec sp_nonpolar; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + UCL_Kernel k_dispersion; + + protected: + bool _allocated; + int dispersion_real(const int eflag, const int vflag); + int multipole_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + +}; + +} + +#endif diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp new file mode 100644 index 0000000000..b9e31e7b20 --- /dev/null +++ b/lib/gpu/lal_hippo_ext.cpp @@ -0,0 +1,210 @@ +/*************************************************************************** + hippo_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to hippo acceleration routines. 
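+ These are the C-style entry points (hippo_gpu_init(), hippo_gpu_clear(), the
+ hippo_gpu_compute_*() drivers and hippo_gpu_bytes()) that the pair style in
+ src/GPU/pair_hippo_gpu.cpp calls into.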
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_hippo.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Hippo HIPPOMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale, + int& tep_size) { + HIPPOMF.clear(); + gpu_mode=HIPPOMF.device->gpu_mode(); + double gpu_split=HIPPOMF.device->particle_split(); + int first_gpu=HIPPOMF.device->first_device(); + int last_gpu=HIPPOMF.device->last_device(); + int world_me=HIPPOMF.device->world_me(); + int gpu_rank=HIPPOMF.device->gpu_rank(); + int procs_per_gpu=HIPPOMF.device->procs_per_gpu(); + + tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION); + + HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu); + + bool message=false; + if (HIPPOMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); + + HIPPOMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + HIPPOMF.estimate_gpu_overhead(); + return init_ok; +} + +void hippo_gpu_clear() { + HIPPOMF.clear(); +} + +int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd) { + return HIPPOMF.compute_dispersion_real(ago, inum_full, 
nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd); +} + +int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr) { + return HIPPOMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); +} + +int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr) { + return HIPPOMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); +} + +int** hippo_gpu_compute_polar_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, 
const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + return HIPPOMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +double hippo_gpu_bytes() { + return HIPPOMF.host_memory_usage(); +} diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 4894ac6203..91bc679447 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -65,17 +65,6 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const double polar_dscale, const double polar_uscale, int& tq_size); void amoeba_gpu_clear(); -int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd); - int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, @@ -128,9 +117,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tq_pinned = nullptr; - gpu_hal_ready = false; - gpu_repulsion_ready = false; // true for HIPPO - gpu_dispersion_real_ready = false; // true for HIPPO + gpu_hal_ready = false; // true for AMOEBA when ready + gpu_repulsion_ready = false; // always false for AMOEBA + gpu_dispersion_real_ready = false; // always false for AMOEBA gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -205,54 +194,6 @@ void PairAmoebaGPU::init_style() /* ---------------------------------------------------------------------- */ -void PairAmoebaGPU::dispersion_real() -{ - if (!gpu_dispersion_real_ready) { - PairAmoeba::dispersion_real(); - return; - } - - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; - - double sublo[3],subhi[3]; - if (domain->triclinic == 0) { - sublo[0] = domain->sublo[0]; - sublo[1] = domain->sublo[1]; - sublo[2] = domain->sublo[2]; - subhi[0] = domain->subhi[0]; - subhi[1] = domain->subhi[1]; - subhi[2] = domain->subhi[2]; - } else { - domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); - } - inum = atom->nlocal; - - // select the correct cutoff for the term - - if (use_dewald) choose(DISP_LONG); - else choose(DISP); - - firstneigh = amoeba_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); -} - -/* 
---------------------------------------------------------------------- */ - void PairAmoebaGPU::multipole_real() { if (!gpu_multipole_real_ready) { diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index de17703dc7..e0210faa68 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -35,7 +35,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void induce(); - virtual void dispersion_real(); + //virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp new file mode 100644 index 0000000000..ce0051962b --- /dev/null +++ b/src/GPU/pair_hippo_gpu.cpp @@ -0,0 +1,1175 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern) +------------------------------------------------------------------------- */ + +#include "pair_hippo_gpu.h" + +#include "amoeba_convolution.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_store.h" +#include "force.h" +#include "gpu_extra.h" +#include "math_const.h" +#include "memory.h" +#include "my_page.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" +#include + +using namespace LAMMPS_NS; +using namespace MathConst; + +enum{INDUCE,RSD,SETUP_hippo,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; +enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye + +// External functions from cuda library for atom decomposition + +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int* host_amtype2class, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale, int& tq_size); +void hippo_gpu_clear(); + +int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint 
**special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd); + +int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr); + +int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr); + +int ** hippo_gpu_compute_polar_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +double hippo_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tq_pinned = nullptr; + + gpu_hal_ready = false; // always false for HIPPO + gpu_repulsion_ready = false; // true for HIPPO when ready + gpu_dispersion_real_ready = false; // true for HIPPO when ready + gpu_multipole_real_ready = true; + gpu_udirect2b_ready = true; + gpu_umutual2b_ready = true; + gpu_polar_real_ready = true; + + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays 
+------------------------------------------------------------------------- */ + +PairHippoGPU::~PairHippoGPU() +{ + hippo_gpu_clear(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairHippoGPU::init_style() +{ + PairAmoeba::init_style(); + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int tq_size; + int mnf = 5e-2 * neighbor->oneatom; + int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + pdamp, thole, dirdamp, amtype2class, special_hal, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, csix, adisp, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + maxspecial15, cell_size, gpu_mode, screen, + polar_dscale, polar_uscale, tq_size); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + + if (tq_size == sizeof(double)) + tq_single = false; + else + tq_single = true; +} + +/* ---------------------------------------------------------------------- */ + +void PairHippoGPU::dispersion_real() +{ + if (!gpu_dispersion_real_ready) { + PairAmoeba::dispersion_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_dewald) choose(DISP_LONG); + else choose(DISP); + + firstneigh = hippo_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); +} + +/* ---------------------------------------------------------------------- */ + +void PairHippoGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + 
subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + + firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tq_single) { + float *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); + } else { + double *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); + } +} + +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::induce() +{ + bool done; + int i,j,m,ii,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; + + double *poli; + double **conj,**conjp; + double **vec,**vecp; + double **udir,**usum,**usump; + + int debug = 1; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) { + choose(POLAR_LONG); + int nmine = p_kspace->nfft_owned; + memory->create(qfac,nmine,"ameoba/induce:qfac"); + } else choose(POLAR); + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // allocation of arrays + // NOTE: not all are used by all methods + // NOTE: could be re-allocated dynamically + + memory->create(poli,nlocal,"ameoba/induce:poli"); + memory->create(conj,nlocal,3,"ameoba/induce:conj"); + memory->create(conjp,nlocal,3,"ameoba/induce:conjp"); + memory->create(vec,nlocal,3,"ameoba/induce:vec"); + memory->create(vecp,nlocal,3,"ameoba/induce:vecp"); + memory->create(udir,nlocal,3,"ameoba/induce:udir"); + memory->create(usum,nlocal,3,"ameoba/induce:usum"); + memory->create(usump,nlocal,3,"ameoba/induce:usump"); + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // need reverse_comm_pair if dfield0c (i.e. 
udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } +/* + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 10; i++) { + printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", + i, udir[i][0], udir[i][1], udir[i][2], + udirp[i][0], udirp[i][1], udirp[i][2]); + } +*/ + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? + // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + + //error->all(FLERR,"STOP GPU"); + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = 
(udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm_pair(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm_pair(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + + //error->all(FLERR,"STOP"); + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += conjp[i][j]*vecp[i][j]; + sum += rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm_pair(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm_pair(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + // NOTE: comp of b,bp and allreduce only needed if pcgprec ? 
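+      // NOTE (tentative answer): b and bp are the usual PCG "beta"
+      // coefficients used below in conj = zrsd + b*conj, so this reduction
+      // appears to be needed with or without pcgprec; without preconditioning
+      // zrsd == rsd, and the dot products reduce to |rsd|^2 and |rsdp|^2.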
+ + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; j++) { + uind[i][j] += term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (me == 0) + error->warning(FLERR,"hippo induced dipoles did not converge"); + } + + // DEBUG output to dump file + + if (uind_flag) + dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); + + // deallocation of arrays + + memory->destroy(poli); + memory->destroy(conj); + memory->destroy(conjp); + memory->destroy(vec); + memory->destroy(vecp); + memory->destroy(udir); + memory->destroy(usum); + memory->destroy(usump); + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] = upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b(double **field, double **fieldp) +{ + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + firstneigh = hippo_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, 
atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b_cpu() +{ + int i,j,k,m,n,ii,jj,kk,kkk,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti,ddi; + double pgamma; + double damp,expdamp; + double scale3,scale5; + double scale7,scalek; + double bn[4],bcn[3]; + double factor_dscale,factor_pscale,factor_uscale,factor_wscale; + + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + + // neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? 
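+  // NOTE (tentative answer): aesq2n = 1/(MY_PIS*aewald) does become large for
+  // small aewald, but it only enters bn[] through the product
+  // aesq2*aesq2n = 2*aewald/MY_PIS, which vanishes as aewald -> 0, so a tiny
+  // nonzero aewald should be harmless here; aewald == 0 is guarded just below.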
+ + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + ddi = dirdamp[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + factor_wscale = special_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = special_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = special_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = 1.0; + } + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } else { + if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + 
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + firstneigh = hippo_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success,aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (umutual1) + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairHippoGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff and aewald for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + firstneigh = hippo_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tq_single) { + float *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); + } else { + double *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); + } +} + +/* ---------------------------------------------------------------------- + compute atom forces from torques +------------------------------------------------------------------------- */ + +template +void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],_tq[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 
0; i < nlocal; i++) { + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] += vxx; + virial_comp[1] += vyy; + virial_comp[2] += vzz; + virial_comp[3] += vxy; + virial_comp[4] += vxz; + virial_comp[5] += vyz; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairHippoGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + hippo_gpu_bytes(); +} diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h new file mode 100644 index 0000000000..9e961045eb --- /dev/null +++ b/src/GPU/pair_hippo_gpu.h @@ -0,0 +1,80 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(hippo/gpu,PairHippoGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_HIPPO_GPU_H +#define LMP_PAIR_HIPPO_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairHippoGPU : public PairAmoeba { + public: + PairHippoGPU(LAMMPS *lmp); + ~PairHippoGPU(); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + virtual void induce(); + + virtual void dispersion_real(); + virtual void multipole_real(); + virtual void udirect2b(double **, double **); + virtual void umutual2b(double **, double **); + virtual void polar_real(); + + private: + int gpu_mode; + double cpu_time; + void *tq_pinned; + void *fieldp_pinned; + bool tq_single; + + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; + bool gpu_multipole_real_ready; + bool gpu_udirect2b_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; + + void udirect2b_cpu(); + + template + void compute_force_from_torque(const numtyp*, double**, double*); +}; + +} // namespace LAMMPS_NS +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Pair style hippo/gpu requires atom attribute q + +The atom style defined does not have this attribute. 
+ +*/ From bebef1849596eae20ec825cd56375e572124b3d3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 21 Sep 2021 23:46:21 -0500 Subject: [PATCH 046/181] Cleaned up and minor changes --- lib/gpu/lal_amoeba.cu | 383 ++++++++++++------------------------------ 1 file changed, 111 insertions(+), 272 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 60205b16ff..e4d129214a 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -404,189 +404,6 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 -/* ---------------------------------------------------------------------- - dispersion = real-space portion of Ewald dispersion - adapted from Tinker edreal1d() routine -------------------------------------------------------------------------- */ - -__kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff_amtype, - const __global numtyp4 *restrict coeff_amclass, - const __global numtyp4 *restrict sp_nonpolar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - const int t_per_atom, const numtyp aewald, - const numtyp off2) -{ - int tid, ii, offset, i; - atom_info(t_per_atom,ii,tid,offset); - - int n_stride; - local_allocate_store_charge(); - - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp energy, e_coul, virial[6]; - if (EVFLAG) { - energy=(acctyp)0; - e_coul=(acctyp)0; - for (int l=0; l<6; l++) virial[l]=(acctyp)0; - } - - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - - if (iioff2) continue; - - int jtype = polar3[j].z; // amtype[j]; - int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; - numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; - numtyp ak = coeff_amclass[jclass].y; // adisp[jclass]; - - numtyp r6 = r2*r2*r2; - numtyp ralpha2 = r2 * aewald*aewald; - numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2; - numtyp expterm = ucl_exp(-ralpha2); - numtyp expa = expterm * term; - - // find the damping factor for the dispersion interaction - - numtyp r = ucl_sqrt(r2); - numtyp r7 = r6 * r; - numtyp di = ai * r; - numtyp di2 = di * di; - numtyp di3 = di * di2; - numtyp dk = ak * r; - numtyp expi = ucl_exp(-di); - numtyp expk = ucl_exp(-dk); - - numtyp ai2,ak2; - numtyp di4,di5; - numtyp dk2,dk3; - numtyp ti,ti2; - numtyp tk,tk2; - numtyp damp3,damp5; - numtyp ddamp; - numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)]; - - if (ai != ak) { - ai2 = ai * ai; - ak2 = ak * ak; - dk2 = dk * dk; - dk3 = dk * dk2; - ti = ak2 / (ak2-ai2); - ti2 = ti * ti; - tk = ai2 / (ai2-ak2); - tk2 = tk * tk; - damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi - - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk - - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi - - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk; - damp5 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi - - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk - - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi - - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; - ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + - (numtyp)0.25 * dk2 * tk2 * ak * expk * 
(r*ak+(numtyp)4.0*ti-(numtyp)1.0); - - } else { - di4 = di2 * di2; - di5 = di2 * di3; - damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi; - damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi; - ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0; - } - - numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; - - // apply damping and scaling factors for this interaction - - numtyp scale = factor_disp * damp*damp; - scale = scale - (numtyp )1.0; - numtyp e = -ci * ck * (expa+scale) / r6; - numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r; - numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7; - - energy+= e; - - // increment the damped dispersion derivative components - - numtyp dedx = de * xr; - numtyp dedy = de * yr; - numtyp dedz = de * zr; - f.x += dedx; - f.y += dedy; - f.z += dedz; - - // increment the internal virial tensor components - - numtyp vxx = xr * dedx; - numtyp vyx = yr * dedx; - numtyp vzx = zr * dedx; - numtyp vyy = yr * dedy; - numtyp vzy = zr * dedy; - numtyp vzz = zr * dedz; - - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vyx; - virial[4] += vzx; - virial[5] += vzy; - } // nbor - - } // iioff2) continue; numtyp r = ucl_sqrt(r2); - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; @@ -910,7 +732,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; numtyp4* polar1 = (numtyp4*)(&extra[0]); numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); @@ -933,21 +754,23 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - int itype,igroup; numtyp bn[4],bcn[3]; numtyp fid[3],fip[3]; - - dix = polar1[i].y; // rpole[i][1]; - diy = polar1[i].z; // rpole[i][2]; - diz = polar1[i].w; // rpole[i][3]; - qixx = polar2[i].x; // rpole[i][4]; - qixy = polar2[i].y; // rpole[i][5]; - qixz = polar2[i].z; // rpole[i][6]; - qiyy = polar2[i].w; // rpole[i][8]; - qiyz = polar3[i].x; // rpole[i][9]; - qizz = polar3[i].y; // rpole[i][12]; - itype = polar3[i].z; // amtype[i]; - igroup = polar3[i].w; // 
amgroup[i]; + + const numtyp4 pol1i = polar1[i]; + numtyp dix = pol1i.y; // rpole[i][1]; + numtyp diy = pol1i.z; // rpole[i][2]; + numtyp diz = pol1i.w; // rpole[i][3]; + const numtyp4 pol2i = polar2[i]; + numtyp qixx = pol2i.x; // rpole[i][4]; + numtyp qixy = pol2i.y; // rpole[i][5]; + numtyp qixz = pol2i.z; // rpole[i][6]; + numtyp qiyy = pol2i.w; // rpole[i][8]; + const numtyp4 pol3i = polar3[i]; + numtyp qiyz = pol3i.x; // rpole[i][9]; + numtyp qizz = pol3i.y; // rpole[i][12]; + int itype = pol3i.z; // amtype[i]; + int igroup = pol3i.w; // amgroup[i]; // debug: // xi__ = ix; xi__.w = itype; @@ -984,18 +807,21 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; numtyp factor_dscale, factor_pscale; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; @@ -1185,14 +1011,17 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp rr3 = rr1 * r2inv; numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - numtyp ukx = polar4[j].x; // uind[j][0]; - numtyp uky = polar4[j].y; // uind[j][1]; - numtyp ukz = polar4[j].z; // uind[j][2]; - numtyp ukxp = polar5[j].x; // uinp[j][0]; - numtyp ukyp = polar5[j].y; // uinp[j][1]; - numtyp ukzp = polar5[j].z; // uinp[j][2]; + const numtyp4 pol3j = polar3[j]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; + const numtyp4 pol4j = polar4[j]; + numtyp ukx = pol4j.x; // uind[j][0]; + numtyp uky = pol4j.y; // uind[j][1]; + numtyp ukz = pol4j.z; // uind[j][2]; + const numtyp4 pol5j = polar5[j]; + numtyp ukxp = pol5j.x; // uinp[j][0]; + numtyp ukyp = pol5j.y; // uinp[j][1]; + numtyp ukzp = pol5j.z; // uinp[j][2]; numtyp factor_uscale; if (igroup == jgroup) factor_uscale = polar_uscale; @@ -1355,24 +1184,29 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - ci = polar1[i].x; // rpole[i][0]; - dix = polar1[i].y; // rpole[i][1]; - diy = polar1[i].z; // rpole[i][2]; - diz = polar1[i].w; // rpole[i][3]; - qixx = polar2[i].x; // rpole[i][4]; - qixy = polar2[i].y; // rpole[i][5]; - qixz = polar2[i].z; // rpole[i][6]; - qiyy = polar2[i].w; // rpole[i][8]; - qiyz = polar3[i].x; // rpole[i][9]; - qizz = polar3[i].y; // rpole[i][12]; - itype = polar3[i].z; // amtype[i]; - igroup = polar3[i].w; // amgroup[i]; - uix = 
polar4[i].x; // uind[i][0]; - uiy = polar4[i].y; // uind[i][1]; - uiz = polar4[i].z; // uind[i][2]; - uixp = polar5[i].x; // uinp[i][0]; - uiyp = polar5[i].y; // uinp[i][1]; - uizp = polar5[i].z; // uinp[i][2]; + const numtyp4 pol1i = polar1[i]; + ci = pol1i.x; // rpole[i][0]; + dix = pol1i.y; // rpole[i][1]; + diy = pol1i.z; // rpole[i][2]; + diz = pol1i.w; // rpole[i][3]; + const numtyp4 pol2i = polar2[i]; + qixx = pol2i.x; // rpole[i][4]; + qixy = pol2i.y; // rpole[i][5]; + qixz = pol2i.z; // rpole[i][6]; + qiyy = pol2i.w; // rpole[i][8]; + const numtyp4 pol3i = polar3[i]; + qiyz = pol3i.x; // rpole[i][9]; + qizz = pol3i.y; // rpole[i][12]; + itype = pol3i.z; // amtype[i]; + igroup = pol3i.w; // amgroup[i]; + const numtyp4 pol4i = polar4[i]; + uix = pol4i.x; // uind[i][0]; + uiy = pol4i.y; // uind[i][1]; + uiz = pol4i.z; // uind[i][2]; + const numtyp4 pol5i = polar5[i]; + uixp = pol5i.x; // uinp[i][0]; + uiyp = pol5i.y; // uinp[i][1]; + uizp = pol5i.z; // uinp[i][2]; // debug: // xi__ = ix; xi__.w = itype; @@ -1398,24 +1232,29 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp r = ucl_sqrt(r2); + const numtyp4 pol1j = polar1[j]; numtyp ck = polar1[j].x; // rpole[j][0]; numtyp dkx = polar1[j].y; // rpole[j][1]; numtyp dky = polar1[j].z; // rpole[j][2]; numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - numtyp ukx = polar4[j].x; // uind[j][0]; - numtyp uky = polar4[j].y; // uind[j][1]; - numtyp ukz = polar4[j].z; // uind[j][2]; - numtyp ukxp = polar5[j].x; // uinp[j][0]; - numtyp ukyp = polar5[j].y; // uinp[j][1]; - numtyp ukzp = polar5[j].z; // uinp[j][2]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; + const numtyp4 pol4j = polar4[j]; + numtyp ukx = pol4j.x; // uind[j][0]; + numtyp uky = pol4j.y; // uind[j][1]; + numtyp ukz = pol4j.z; // uind[j][2]; + const numtyp4 pol5j = polar5[j]; + numtyp ukxp = pol5j.x; // uinp[j][0]; + numtyp ukyp = pol5j.y; // uinp[j][1]; + numtyp ukzp = pol5j.z; // uinp[j][2]; numtyp factor_dscale, factor_pscale, factor_uscale; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; From 2428f1f4d527cd053d6d587632219eb829fc844d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 22 Sep 2021 11:44:41 -0500 Subject: [PATCH 047/181] Updated hippo kernels --- lib/gpu/lal_hippo.cu | 208 ++++++++++++++++++++++++------------------- 1 file changed, 115 insertions(+), 93 deletions(-) diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index a21afe6cd8..07df4c6ad0 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -55,7 +55,7 @@ _texture( q_tex,int2); #define local_allocate_store_ufld() \ __local acctyp red_acc[6][BLOCK_PAIR]; -#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ +#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ tep) \ if (t_per_atom>1) { \ red_acc[0][tid]=tq.x; \ @@ -225,7 +225,7 @@ 
_texture( q_tex,int2); #define local_allocate_store_ufld() -#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ +#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ tep) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ @@ -636,7 +636,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp term1,term2,term3; numtyp term4,term5,term6; numtyp bn[6]; - numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; int numj, nbor, nbor_end; const __global int* nbor_mem=dev_packed; @@ -655,16 +654,19 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - ci = polar1[i].x; // rpole[i][0]; - dix = polar1[i].y; // rpole[i][1]; - diy = polar1[i].z; // rpole[i][2]; - diz = polar1[i].w; // rpole[i][3]; - qixx = polar2[i].x; // rpole[i][4]; - qixy = polar2[i].y; // rpole[i][5]; - qixz = polar2[i].z; // rpole[i][6]; - qiyy = polar2[i].w; // rpole[i][8]; - qiyz = polar3[i].x; // rpole[i][9]; - qizz = polar3[i].y; // rpole[i][12]; + const numtyp4 pol1i = polar1[i]; + numtyp ci = pol1i.x; // rpole[i][0]; + numtyp dix = pol1i.y; // rpole[i][1]; + numtyp diy = pol1i.z; // rpole[i][2]; + numtyp diz = pol1i.w; // rpole[i][3]; + const numtyp4 pol2i = polar2[i]; + numtyp qixx = pol2i.x; // rpole[i][4]; + numtyp qixy = pol2i.y; // rpole[i][5]; + numtyp qixz = pol2i.z; // rpole[i][6]; + numtyp qiyy = pol2i.w; // rpole[i][8]; + const numtyp4 pol3i = polar3[i]; + numtyp qiyz = pol3i.x; // rpole[i][9]; + numtyp qizz = pol3i.y; // rpole[i][12]; for ( ; nboroff2) continue; numtyp r = ucl_sqrt(r2); - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; @@ -873,7 +878,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, } // ii Date: Thu, 23 Sep 2021 09:21:55 -0500 Subject: [PATCH 048/181] Started working on hippo/gpu --- src/GPU/pair_hippo_gpu.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index ce0051962b..be5d4afc2b 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -121,6 +121,9 @@ double hippo_gpu_bytes(); PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) { + amoeba = 0; + hippo = 1; + respa_enable = 0; reinitflag = 0; cpu_time = 0.0; @@ -131,9 +134,9 @@ 
PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // always false for HIPPO gpu_repulsion_ready = false; // true for HIPPO when ready gpu_dispersion_real_ready = false; // true for HIPPO when ready - gpu_multipole_real_ready = true; - gpu_udirect2b_ready = true; - gpu_umutual2b_ready = true; + gpu_multipole_real_ready = false; + gpu_udirect2b_ready = false; + gpu_umutual2b_ready = false; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); From ad8164dfc0ed38f20a304a140b080a94c6a110f9 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 24 Sep 2021 00:21:25 -0500 Subject: [PATCH 049/181] Fixed bugs in the dispersion real-space term for hippo. NOTE: CPU version filter out neighbors with zero special_disp --- lib/gpu/lal_hippo.cpp | 11 ++++++----- lib/gpu/lal_hippo.cu | 7 ++++--- src/GPU/pair_hippo_gpu.cpp | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 7fa358e35a..07f8732bcb 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -210,10 +210,11 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, this->_aewald = aewald; const int red_blocks=dispersion_real(eflag,vflag); - // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (polar_real) - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); + // only copy them back if this is the last kernel + // otherwise, commenting out these two lines to leave the answers + // (forces, energies and virial) on the device until the last kernel + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -238,7 +239,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list for the cutoff off2_mpole, + // Build the short neighbor list for the cutoff off2_disp, // at this point mpole is the first kernel in a time step this->k_short_nbor.set_size(GX,BX); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 07df4c6ad0..f9020cf9a6 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -512,7 +512,8 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp tk,tk2; numtyp damp3,damp5; numtyp ddamp; - numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)]; + const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; + numtyp factor_disp = sp_nonpol.z; // factor_disp = special_disp[sbmask15(j)]; if (ai != ak) { ai2 = ai * ai; @@ -547,7 +548,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, // apply damping and scaling factors for this interaction numtyp scale = factor_disp * damp*damp; - scale = scale - (numtyp )1.0; + scale = scale - (numtyp)1.0; numtyp e = -ci * ck * (expa+scale) / r6; numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r; numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7; @@ -562,7 +563,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, f.x += dedx; f.y += dedy; f.z += dedz; - + // increment the internal virial tensor components numtyp vxx = xr * dedx; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index be5d4afc2b..a6e7b9edc6 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp 
@@ -133,11 +133,11 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // always false for HIPPO gpu_repulsion_ready = false; // true for HIPPO when ready - gpu_dispersion_real_ready = false; // true for HIPPO when ready + gpu_dispersion_real_ready = true; // true for HIPPO when ready gpu_multipole_real_ready = false; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } From e77df80ce22d0eff3fb8e1f84921f3c1b959609e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 24 Sep 2021 16:44:43 -0500 Subject: [PATCH 050/181] Working hippo multipole real-space term, added helper functions in a separate file --- lib/gpu/lal_base_amoeba.h | 8 +- lib/gpu/lal_hippo.cpp | 109 ++++++++++- lib/gpu/lal_hippo.cu | 357 +++++++++++++++++++++++++++++++++++-- lib/gpu/lal_hippo.h | 15 ++ lib/gpu/lal_hippo_ext.cpp | 7 +- lib/gpu/lal_hippo_extra.h | 326 +++++++++++++++++++++++++++++++++ src/GPU/pair_hippo_gpu.cpp | 9 +- 7 files changed, 797 insertions(+), 34 deletions(-) create mode 100644 lib/gpu/lal_hippo_extra.h diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 40da00f176..997e7b21ed 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -143,7 +143,7 @@ class BaseAmoeba { double *charge, double *boxlo, double *prd); /// Compute multipole real-space with device neighboring - int** compute_multipole_real(const int ago, const int inum_full, const int nall, + virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, @@ -155,7 +155,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring - int** compute_udirect2b(const int ago, const int inum_full, const int nall, + virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -169,7 +169,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring - int** compute_umutual2b(const int ago, const int inum_full, const int nall, + virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -183,7 +183,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring - int** compute_polar_real(const int ago, const int inum_full, const int nall, + virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 07f8732bcb..fad749a185 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -56,6 +56,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_special_polar_piscale, const double 
*host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -69,7 +70,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, if (success!=0) return success; + // specific to HIPPO k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); + _pval.alloc(this->_max_tep_size,*(this->ucl_device),UCL_READ_ONLY,UCL_READ_ONLY); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; @@ -98,8 +101,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, for (int i = 0; i < max_amclass; i++) { host_write2[i].x = host_csix[i]; host_write2[i].y = host_adisp[i]; - host_write2[i].z = (numtyp)0; - host_write2[i].w = (numtyp)0; + host_write2[i].z = host_pcore[i]; + host_write2[i].w = host_palpha[i]; } coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); @@ -262,6 +265,93 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute multipole real-space +// --------------------------------------------------------------------------- +template +int** HippoT::compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_mpole, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
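  // Layout of atom->extra as viewed by the kernels (numtyp4 pointers), from
  // the casts visible elsewhere in this series; the polar4/polar5 offsets are
  // not shown in this hunk but presumably continue the same 4*nall stride:
  //   polar1 = &extra[0]       -> rpole[0..3]           (charge + dipole)
  //   polar2 = &extra[4*nall]  -> rpole[4],[5],[6],[8]   (quadrupole xx,xy,xz,yy)
  //   polar3 = &extra[8*nall]  -> rpole[9],[12], amtype, amgroup
  //   polar4, polar5           -> uind, uinp
  //   pval (HIPPO only)        -> appended after the uinp block by cast_extra_data()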
+ + int** firstneigh = nullptr; + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_mpole = off2_mpole; + this->_felec = felec; + this->_aewald = aewald; + const int red_blocks=multipole_real(eflag,vflag); + + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + this->hd_balancer.stop_timer(); + + // copy tep from device to host + + this->_tep.update_host(this->_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Calculate the multipole real-space term, returning tep // --------------------------------------------------------------------------- @@ -290,13 +380,14 @@ int HippoT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, &this->_felec, - &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->k_multipole.run(&this->atom->x, &this->atom->extra, &_pval, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); return GX; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index f9020cf9a6..56da15f8aa 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -15,7 +15,8 @@ #if defined(NV_KERNEL) || defined(USE_HIP) #include -#include "lal_aux_fun1.h" +#include "lal_hippo_extra.h" +//#include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int #endif @@ -404,6 +405,318 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? 
(A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + repulsion = Pauli repulsion interactions + adapted from Tinker erepel1b() routine +------------------------------------------------------------------------- */ + +__kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict sp_nonpolar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global acctyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp off2, const numtyp cut2, + const numtyp c0, const numtyp c1, const numtyp c2, + const numtyp c3, const numtyp c4, const numtyp c5) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int l=0; l<6; l++) virial[l]=(acctyp)0; + } + + acctyp4 tq; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; + + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + if (iioff2) continue; + + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + + numtyp sizk = coeff[jtype].x; // sizpr[jtype]; + numtyp dmpk = coeff[jtype].y; // dmppr[jtype]; + numtyp valk = coeff[jtype].z; // elepr[jtype]; + + const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; + numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)]; + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + numtyp dik = dix*dkx + diy*dky + diz*dkz; + numtyp qik = qix*qkx + qiy*qky + qiz*qkz; + numtyp diqk = dix*qkx + diy*qky + diz*qkz; + numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + qixx*qkxx + qiyy*qkyy + qizz*qkzz; + + // additional intermediates involving moments and distance + + numtyp dirx = diy*zr - diz*yr; + numtyp diry = diz*xr - dix*zr; + numtyp dirz = dix*yr - diy*xr; + numtyp dkrx = dky*zr - dkz*yr; + numtyp dkry = dkz*xr - dkx*zr; + numtyp dkrz = dkx*yr - dky*xr; + numtyp dikx = diy*dkz - diz*dky; + numtyp diky = diz*dkx - dix*dkz; + numtyp dikz = dix*dky - diy*dkx; + numtyp qirx = qiz*yr - 
qiy*zr; + numtyp qiry = qix*zr - qiz*xr; + numtyp qirz = qiy*xr - qix*yr; + numtyp qkrx = qkz*yr - qky*zr; + numtyp qkry = qkx*zr - qkz*xr; + numtyp qkrz = qky*xr - qkx*yr; + numtyp qikx = qky*qiz - qkz*qiy; + numtyp qiky = qkz*qix - qkx*qiz; + numtyp qikz = qkx*qiy - qky*qix; + numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz; + numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz; + numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz; + numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz; + numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz; + numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz; + numtyp qikrx = qizk*yr - qiyk*zr; + numtyp qikry = qixk*zr - qizk*xr; + numtyp qikrz = qiyk*xr - qixk*yr; + numtyp qkirx = qkzi*yr - qkyi*zr; + numtyp qkiry = qkxi*zr - qkzi*xr; + numtyp qkirz = qkyi*xr - qkxi*yr; + numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; + numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; + numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; + numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; + numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; + numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; + numtyp diqkrx = diqkz*yr - diqky*zr; + numtyp diqkry = diqkx*zr - diqkz*xr; + numtyp diqkrz = diqky*xr - diqkx*yr; + numtyp dkqirx = dkqiz*yr - dkqiy*zr; + numtyp dkqiry = dkqix*zr - dkqiz*xr; + numtyp dkqirz = dkqiy*xr - dkqix*yr; + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); + + // get reciprocal distance terms for this interaction + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; + + // get damping coefficients for the Pauli repulsion energy + numtyp dmpik[11]; + damprep(r,r2,rr1,rr3,rr5,rr7,rr9,rr11,11,dmpi,dmpk,dmpik); + + // calculate intermediate terms needed for the energy + + numtyp term1 = vali*valk; + numtyp term2 = valk*dir - vali*dkr + dik; + numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + numtyp term4 = dir*qkr - dkr*qir - 4.0*qik; + numtyp term5 = qir*qkr; + numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + + term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8]; + + // compute the Pauli repulsion energy for this interaction + + numtyp sizik = sizi * sizk * factor_repel; + numtyp e = sizik * eterm * rr1; + + // calculate intermediate terms for force and torque + + numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + + term4*dmpik[8] + term5*dmpik[10]; + term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6]; + term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6]; + term3 = (numtyp)2.0 * dmpik[4]; + term4 = (numtyp)2.0 * (-valk*dmpik[4] + dkr*dmpik[6] - qkr*dmpik[8]); + term5 = (numtyp)2.0 * (-vali*dmpik[4] - dir*dmpik[6] - qir*dmpik[8]); + numtyp term6 = (numtyp)4.0 * dmpik[6]; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = 
de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + frcx = frcx*rr1 + eterm*rr3*xr; + frcy = frcy*rr1 + eterm*rr3*yr; + frcz = frcz*rr1 + eterm*rr3*zr; + + // compute the torque components for this interaction + + numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + ttmix = sizik * ttmix * rr1; + ttmiy = sizik * ttmiy * rr1; + ttmiz = sizik * ttmiz * rr1; + + // use energy switching if near the cutoff distance + + if (r2 > cut2) { + numtyp r3 = r2 * r; + numtyp r4 = r2 * r2; + numtyp r5 = r2 * r3; + numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0; + numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp).0*c4*r3 + + (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1; + dtaper *= e * rr1; + e *= taper; + frcx = frcx*taper - dtaper*xr; + frcy = frcy*taper - dtaper*yr; + frcz = frcz*taper - dtaper*zr; + ttmix *= taper; + ttmiy *= taper; + ttmiz *= taper; + } + + energy += e; + + // increment force-based gradient and torque on atom I + + f.x += frcx; + f.y += frcy; + f.z += frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + // increment the internal virial tensor components + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii { const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -65,6 +66,18 @@ class Hippo : public BaseAmoeba { const double aewald, const double off2_disp, double *charge, double *boxlo, double *prd); + /// Compute multipole real-space with device neighboring + virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double felec, const double off2_mpole, double *charge, + double *boxlo, double *prd, void **tep_ptr); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -105,6 +118,8 @@ class Hippo : public BaseAmoeba { UCL_Kernel k_dispersion; + UCL_Vector _pval; + protected: bool _allocated; int dispersion_real(const int eflag, const int vflag); diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index b9e31e7b20..fa09e7bce4 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -38,6 +38,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double 
*host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -73,7 +74,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - host_csix, host_adisp, nlocal, nall, max_nbors, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, polar_dscale, polar_uscale); @@ -97,7 +99,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - host_csix, host_adisp, nlocal, nall, max_nbors, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, polar_dscale, polar_uscale); diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h new file mode 100644 index 0000000000..890ce51121 --- /dev/null +++ b/lib/gpu/lal_hippo_extra.h @@ -0,0 +1,326 @@ +/// ************************************************************************** +// hippo_extra.h +// ------------------- +// Trung Dac Nguyen +// +// Device code for hippo math routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndactrung@gmail.com +// ***************************************************************************/* + +#ifndef LAL_HIPPO_EXTRA_H +#define LAL_HIPPO_EXTRA_H + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#else +#endif + +#define MY_PI2 (numtyp)1.57079632679489661923 +#define MY_PI4 (numtyp)0.78539816339744830962 + +/* ---------------------------------------------------------------------- + damprep generates coefficients for the Pauli repulsion + damping function for powers of the interatomic distance + + literature reference: + + J. A. Rackers and J. W. 
Ponder, "Classical Pauli Repulsion: An + Anisotropic, Atomic Multipole Model", Journal of Chemical Physics, + 150, 084104 (2019) +------------------------------------------------------------------------- */ + +ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, + const numtyp rr3, const numtyp rr5, const numtyp rr7, + const numtyp rr9, const numtyp rr11, const int rorder, + const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11]) +{ + numtyp r3,r4; + numtyp r5,r6,r7,r8; + numtyp s,ds,d2s; + numtyp d3s,d4s,d5s; + numtyp dmpi2,dmpk2; + numtyp dmpi22,dmpi23; + numtyp dmpi24,dmpi25; + numtyp dmpi26,dmpi27; + numtyp dmpk22,dmpk23; + numtyp dmpk24,dmpk25; + numtyp dmpk26; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp pre,term,tmp; + + // compute tolerance value for damping exponents + + eps = (numtyp)0.001; + diff = dmpi-dmpk; + if (diff < (numtyp)0) diff = -diff; + + // treat the case where alpha damping exponents are equal + + if (diff < eps) { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + r6 = r5 * r; + r7 = r6 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dampi = dmpi2 * r; + expi = ucl_exp(-dampi); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpi26 = dmpi25 * dmpi2; + pre = (numtyp)128.0; + s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi; + + ds = (dmpi22*r3 + dmpi23*r4) * expi / 3.0; + d2s = dmpi24 * expi * r5 / 9.0; + d3s = dmpi25 * expi * r6 / 45.0; + d4s = (dmpi25*r6 + dmpi26*r7) * expi / 315.0; + if (rorder >= 11) { + r8 = r7 * r; + dmpi27 = dmpi2 * dmpi26; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0; + } + + // treat the case where alpha damping exponents are unequal + + } else { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + dmpi2 = 0.5 * dmpi; + dmpk2 = 0.5 * dmpk; + dampi = dmpi2 * r; + dampk = dmpk2 * r; + expi = exp(-dampi); + expk = exp(-dampk); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpk22 = dmpk2 * dmpk2; + dmpk23 = dmpk22 * dmpk2; + dmpk24 = dmpk23 * dmpk2; + dmpk25 = dmpk24 * dmpk2; + term = dmpi22 - dmpk22; + pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0); + tmp = 4.0 * dmpi2 * dmpk2 / term; + s = (dampi-tmp)*expk + (dampk+tmp)*expi; + + ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - + (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + + (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + 4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - + 4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + + (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + + 4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - + (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - + (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + + (4.0/105.0)*dmpi25*dmpk2*r4/term + 
(8.0/21.0)*dmpi24*dmpk2*r3/term + + (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + 4.0*dmpi2*dmpk2/term) * expi; + + if (rorder >= 11) { + r6 = r5 * r; + dmpi26 = dmpi25 * dmpi2; + dmpk26 = dmpk25 * dmpk2; + d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - + (4.0/945.0)*dmpi2*dmpk26*r5/term - + (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - + (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + + (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + + (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; + } + } + + // convert partial derivatives into full derivatives + + s = s * rr1; + ds = ds * rr3; + d2s = d2s * rr5; + d3s = d3s * rr7; + d4s = d4s * rr9; + d5s = d5s * rr11; + dmpik[0] = 0.5 * pre * s * s; + dmpik[2] = pre * s * ds; + dmpik[4] = pre * (s*d2s + ds*ds); + dmpik[6] = pre * (s*d3s + 3.0*ds*d2s); + dmpik[8] = pre * (s*d4s + 4.0*ds*d3s + 3.0*d2s*d2s); + if (rorder >= 11) dmpik[10] = pre * (s*d5s + 5.0*ds*d4s + 10.0*d2s*d3s); +} + +/* ---------------------------------------------------------------------- + damppole generates coefficients for the charge penetration + damping function for powers of the interatomic distance + + literature references: + + L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the + Effective Fragment Potential Method: Theory and Application to + the Benzene Dimer", Journal of Computational Chemistry, 28, + 276-291 (2007) [Gordon f1 and f2 models] + + J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and + J. W. 
Ponder, "An Optimized Charge Penetration Model for Use with + the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19, + 276-291 (2017) +------------------------------------------------------------------------- */ + +ucl_inline void damppole(const numtyp r, const int rorder, + const numtyp alphai, const numtyp alphak, + numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampi6,dampi7; + numtyp dampi8; + numtyp dampk2,dampk3; + numtyp dampk4,dampk5; + numtyp dampk6; + + // compute tolerance and exponential damping factors + + eps = 0.001; + diff = fabs(alphai-alphak); + dampi = alphai * r; + dampk = alphak * r; + expi = exp(-dampi); + expk = exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpi[0] = 1.0 - (1.0 + 0.5*dampi)*expi; + dmpi[2] = 1.0 - (1.0 + dampi + 0.5*dampi2)*expi; + dmpi[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi; + dmpi[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + dampi4/30.0)*expi; + dmpi[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 4.0*dampi4/105.0 + dampi5/210.0)*expi; + if (diff < eps) { + dmpk[0] = dmpi[0]; + dmpk[2] = dmpi[2]; + dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + dmpk[8] = dmpi[8]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dampk5 = dampk2 * dampk3; + dmpk[0] = 1.0 - (1.0 + 0.5*dampk)*expk; + dmpk[2] = 1.0 - (1.0 + dampk + 0.5*dampk2)*expk; + dmpk[4] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk; + dmpk[6] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk; + dmpk[8] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + + 4.0*dampk4/105.0 + dampk5/210.0)*expk; + } + + // valence-valence charge penetration damping for Gordon f1 + + if (diff < eps) { + dampi6 = dampi3 * dampi3; + dampi7 = dampi3 * dampi4; + dmpik[0] = 1.0 - (1.0 + 11.0*dampi/16.0 + 3.0*dampi2/16.0 + + dampi3/48.0)*expi; + dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 + + 7.0*dampi3/48.0 + dampi4/48.0)*expi; + dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/144.0)*expi; + dmpik[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0)*expi; + dmpik[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + + dampi7/5040.0)*expi; + if (rorder >= 11) { + dampi8 = dampi4 * dampi4; + dmpik[10] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + + dampi7/5040.0 + dampi8/45360.0)*expi; + } + + } else { + alphai2 = alphai * alphai; + alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * termk; + dmpik[0] = 1.0 - termi2*(1.0 + 2.0*termk + 0.5*dampi)*expi - + termk2*(1.0 + 2.0*termi + 0.5*dampk)*expk; + dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi - + termk2*(1.0+dampk+0.5*dampk2)*expk - + 2.0*termi2*termk*(1.0+dampi)*expi - + 2.0*termk2*termi*(1.0+dampk)*expk; + dmpik[4] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + dampi2/3.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + dampk2/3.0)*expk; + dmpik[6] = 1.0 - termi2*(1.0 + dampi 
+ 0.5*dampi2 + + dampi3/6.0 + dampi4/30.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 2.0*dampi2/5.0 + dampi3/15.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 2.0*dampk2/5.0 + dampk3/15.0)*expk; + dmpik[8] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 4.0*dampi4/105.0 + dampi5/210.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + + 4.0*dampk4/105.0 + dampk5/210.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 3.0*dampi2/7.0 + + 2.0*dampi3/21.0 + dampi4/105.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 3.0*dampk2/7.0 + + 2.0*dampk3/21.0 + dampk4/105.0)*expk; + + if (rorder >= 11) { + dampi6 = dampi3 * dampi3; + dampk6 = dampk3 * dampk3; + dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 5.0*dampi4/126.0 + 2.0*dampi5/315.0 + + dampi6/1890.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + + 2.0*dampk5/315.0 + dampk6/1890.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + + dampi4/63.0 + dampi5/945.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + + dampk4/63.0 + dampk5/945.0)*expk; + } + } +} + + + +#endif diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index a6e7b9edc6..91465abb82 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -59,6 +59,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -191,10 +192,10 @@ void PairHippoGPU::init_style() pdamp, thole, dirdamp, amtype2class, special_hal, special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, - special_polar_pscale, csix, adisp, atom->nlocal, - atom->nlocal+atom->nghost, mnf, maxspecial, - maxspecial15, cell_size, gpu_mode, screen, - polar_dscale, polar_uscale, tq_size); + special_polar_pscale, csix, adisp, pcore, palpha, + atom->nlocal, atom->nlocal+atom->nghost, mnf, + maxspecial, maxspecial15, cell_size, gpu_mode, + screen, polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) From 78ef0d631fefab0af68d22371271cb8935c5e3b6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 25 Sep 2021 12:25:34 -0500 Subject: [PATCH 051/181] Working on the multipole real-space term of hippo --- lib/gpu/lal_base_amoeba.cpp | 10 ++- lib/gpu/lal_base_amoeba.h | 4 +- lib/gpu/lal_hippo.cpp | 133 +++++++++++++++++++++++++++++++----- lib/gpu/lal_hippo.cu | 83 ++++++++++++++-------- lib/gpu/lal_hippo.h | 16 ++++- lib/gpu/lal_hippo_ext.cpp | 4 +- src/GPU/pair_hippo_gpu.cpp | 20 +++--- 7 files changed, 207 insertions(+), 63 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index b8e927d6ce..1a299e902f 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -757,7 +757,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const { template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, - double** uind, double** uinp) { + double** uind, double** uinp, double* pval) { // signal that we need to transfer extra data from the host atom->extra_data_unavail(); @@ -812,6 +812,14 @@ void 
BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+2] = uinp[i][2]; } } + + if (pval) { + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = pval[i]; + } + } } template diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 997e7b21ed..fc665ec731 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -131,7 +131,7 @@ class BaseAmoeba { bool &success); /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed - int** precompute(const int ago, const int inum_full, const int nall, + virtual int** precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, @@ -232,7 +232,7 @@ class BaseAmoeba { /// cast host arrays into a single array for atom->extra void cast_extra_data(int* amtype, int* amgroup, double** rpole, - double** uind, double** uinp); + double** uind, double** uinp, double* pval=nullptr); /// Per-atom arrays UCL_Vector _tep, _fieldp; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index fad749a185..10d75f2393 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -155,6 +155,102 @@ double HippoT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Hippo); } +// --------------------------------------------------------------------------- +// Prepare for multiple kernel calls in a time step: +// - reallocate per-atom arrays, if needed +// - transfer extra data from host to device +// - build the full neighbor lists for use by different kernels +// --------------------------------------------------------------------------- + +template +int** HippoT::precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // ------------------- Resize 1-5 neighbor arrays ------------------------ + + if (nall>this->_nmax) { + this->_nmax = nall; + this->dev_nspecial15.clear(); + this->dev_special15.clear(); + this->dev_special15_t.clear(); + this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return nullptr; + } + + this->hd_balancer.balance(cpu_time); + int inum=this->hd_balancer.get_gpu_count(ago,inum_full); + this->ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if 
(ago==0) { + this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + this->atom->cast_q_data(host_q); + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->hd_balancer.start_timer(); + } else { + this->atom->cast_x_data(host_x,host_type); + this->atom->cast_q_data(host_q); + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + } + this->atom->add_q_data(); + this->atom->add_extra_data(); + + *ilist=this->nbor->host_ilist.begin(); + *jnum=this->nbor->host_acc.begin(); + + this->device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, + boxlo, prd); + + // re-allocate dev_short_nbor if necessary + if (inum_full*(2+this->_max_nbors) > this->dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(inum_full)*1.10); + this->dev_short_nbor.resize((2+this->_max_nbors)*this->_nmax); + } + + return this->nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- @@ -201,9 +297,9 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, + nullptr, nullptr, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -270,19 +366,20 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int** HippoT::compute_multipole_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_mpole, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double* host_pval, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_mpole, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -311,9 +408,9 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. 
int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, + nullptr, nullptr, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -380,7 +477,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &_pval, + this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 56da15f8aa..bc5d9270d4 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -908,7 +908,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp *restrict pval, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_polar, @@ -945,6 +944,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp4* polar1 = (numtyp4*)(&extra[0]); numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); if (ii { const double gpu_split, FILE *_screen, const double polar_dscale, const double polar_uscale); + /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed + int** precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double* host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -69,8 +81,8 @@ class Hippo : public BaseAmoeba { /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index fa09e7bce4..390f713d98 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -140,7 +140,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int 
*host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, @@ -148,7 +148,7 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, + host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 91465abb82..6ac22e0721 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -79,7 +79,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double *sublo, double *subhi, tagint *tag, + double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -135,7 +135,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // always false for HIPPO gpu_repulsion_ready = false; // true for HIPPO when ready gpu_dispersion_real_ready = true; // true for HIPPO when ready - gpu_multipole_real_ready = false; + gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; gpu_polar_real_ready = false; @@ -294,14 +294,14 @@ void PairHippoGPU::multipole_real() double felec = electric / am_dielectric; firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + atom->type, amtype, amgroup, rpole, pval, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From f8bc091cb8336a486823b4df25c9339a18808cf5 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 25 Sep 2021 13:17:06 -0500 Subject: [PATCH 052/181] Kept working on the multipole real-space term of hippo --- lib/gpu/lal_base_amoeba.cpp | 9 ++++--- lib/gpu/lal_hippo.cu | 43 ++++++++++++++++++--------------- src/AMOEBA/amoeba_multipole.cpp | 3 ++- src/GPU/pair_hippo_gpu.cpp | 2 +- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 
1a299e902f..c4fdb8c9e5 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -793,8 +793,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+3] = (numtyp)amgroup[i]; } + n += nstride*_nall; if (uind) { - n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; pextra[idx] = uind[i][0]; @@ -802,9 +802,9 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+2] = uind[i][2]; } } - + + n += nstride*_nall; if (uinp) { - n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; pextra[idx] = uinp[i][0]; @@ -813,8 +813,9 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, } } + n += nstride*_nall; if (pval) { - n += nstride*_nall; + for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; pextra[idx] = pval[i]; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index bc5d9270d4..040ecf9308 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,6 +1032,9 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; + if (i==0 && j < 10) printf("j = %d: corei = %f; corek = %f; alphai = %f; alphak = %f; vali = %f; valk = %f\n", + j, corei, corek, alphai, alphak, vali, valk); + // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; @@ -1149,22 +1152,22 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp dmpij[11]; damppole(r,11,alphai,alphak,dmpi,dmpj,dmpij); numtyp scalek = factor_mpole; - numtyp rr1i = bn[0] - (1.0-scalek*dmpi[0])*rr1; - numtyp rr3i = bn[1] - (1.0-scalek*dmpi[2])*rr3; - numtyp rr5i = bn[2] - (1.0-scalek*dmpi[4])*rr5; - numtyp rr7i = bn[3] - (1.0-scalek*dmpi[6])*rr7; - numtyp rr1k = bn[0] - (1.0-scalek*dmpj[0])*rr1; - numtyp rr3k = bn[1] - (1.0-scalek*dmpj[2])*rr3; - numtyp rr5k = bn[2] - (1.0-scalek*dmpj[4])*rr5; - numtyp rr7k = bn[3] - (1.0-scalek*dmpj[6])*rr7; - numtyp rr1ik = bn[0] - (1.0-scalek*dmpij[0])*rr1; - numtyp rr3ik = bn[1] - (1.0-scalek*dmpij[2])*rr3; - numtyp rr5ik = bn[2] - (1.0-scalek*dmpij[4])*rr5; - numtyp rr7ik = bn[3] - (1.0-scalek*dmpij[6])*rr7; - numtyp rr9ik = bn[4] - (1.0-scalek*dmpij[8])*rr9; - numtyp rr11ik = bn[5] - (1.0-scalek*dmpij[10])*rr11; - rr1 = bn[0] - (1.0-scalek)*rr1; - rr3 = bn[1] - (1.0-scalek)*rr3; + numtyp rr1i = bn[0] - ((numtyp)1.0-scalek*dmpi[0])*rr1; + numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3; + numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5; + numtyp rr7i = bn[3] - ((numtyp)1.0-scalek*dmpi[6])*rr7; + numtyp rr1k = bn[0] - ((numtyp)1.0-scalek*dmpj[0])*rr1; + numtyp rr3k = bn[1] - ((numtyp)1.0-scalek*dmpj[2])*rr3; + numtyp rr5k = bn[2] - ((numtyp)1.0-scalek*dmpj[4])*rr5; + numtyp rr7k = bn[3] - ((numtyp)1.0-scalek*dmpj[6])*rr7; + numtyp rr1ik = bn[0] - ((numtyp)1.0-scalek*dmpij[0])*rr1; + numtyp rr3ik = bn[1] - ((numtyp)1.0-scalek*dmpij[2])*rr3; + numtyp rr5ik = bn[2] - ((numtyp)1.0-scalek*dmpij[4])*rr5; + numtyp rr7ik = bn[3] - ((numtyp)1.0-scalek*dmpij[6])*rr7; + numtyp rr9ik = bn[4] - ((numtyp)1.0-scalek*dmpij[8])*rr9; + numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11; + rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1; + rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + term1i*rr1i + term1k*rr1k + term1ik*rr1ik + term2i*rr3i + term2k*rr3k + term2ik*rr3ik + @@ -1178,10 +1181,10 @@ __kernel void k_hippo_multipole(const 
__global numtyp4 *restrict x_, term3i*rr7i + term3k*rr7k + term3ik*rr7ik; term1 = -corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik; term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik; - term3 = 2.0 * rr5ik; - term4 = -2.0 * (corek*rr5i+valk*rr5ik - dkr*rr7ik+qkr*rr9ik); - term5 = -2.0 * (corei*rr5k+vali*rr5ik + dir*rr7ik+qir*rr9ik); - term6 = 4.0 * rr7ik; + term3 = (numtyp)2.0 * rr5ik; + term4 = (numtyp)-2.0 * (corek*rr5i+valk*rr5ik - dkr*rr7ik+qkr*rr9ik); + term5 = (numtyp)-2.0 * (corei*rr5k+vali*rr5ik + dir*rr7ik+qir*rr9ik); + term6 = (numtyp)4.0 * rr7ik; rr3 = rr3ik; energy += e; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 3f5c9082e7..8d9e0c101d 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -379,7 +379,8 @@ void PairAmoeba::multipole_real() corek = pcore[jclass]; alphak = palpha[jclass]; valk = pval[j]; - + if (i==0 && j < 10) printf("j = %d: corei = %f; corek = %f; alphai = %f; alphak = %f; vali = %f; valk = %f\n", + j, corei, corek, alphai, alphak, vali, valk); /* printf("HIPPO MPOLE ij %d %d: pcore/alpha/val I %g %g %g: J %g %g %g\n", atom->tag[i],atom->tag[j],corei,alphai,vali,corek,alphak,valk); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 6ac22e0721..3bad2d4f52 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -292,7 +292,7 @@ void PairHippoGPU::multipole_real() // set the energy unit conversion factor for multipolar real-space calculation double felec = electric / am_dielectric; - + printf("hippo gpu multipole\n"); firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, pval, sublo, subhi, atom->tag, From edbed9c9c9c268701d7061dba651179931997c11 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 26 Sep 2021 00:13:40 -0500 Subject: [PATCH 053/181] Fixed bugs in HippoT::compute_dispersion_real and compute_multipole_real to ensure that answers only get copied back from device in the last kernel activated. 
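
Note on the staging idea behind this fix: the stages that are not last (dispersion real-space, and later multipole real-space) leave their forces, energies, and virial in the device-side answer buffers, and the copy back to the host is requested once, by the last kernel that actually runs (multipole_real at this point in the series, with polar_real intended to take over once that kernel is ported). The sketch below only illustrates that control flow with invented stand-in names (Stage, launch_kernel, copy_answers_to_host); it is not the LAMMPS_AL Answer/Device API.

#include <cstdio>
#include <vector>

// Illustration only: every stage accumulates into a device-resident buffer,
// and the single host transfer happens in whichever stage is flagged as last.
struct Stage {
  const char *name;
  bool last;                     // true only for the final real-space stage
};

static void launch_kernel(const char *name, std::vector<double> &dev_buf) {
  dev_buf.push_back(1.0);        // stands in for accumulating on the GPU
  std::printf("%s: answers stay on the device\n", name);
}

static void copy_answers_to_host(const std::vector<double> &dev_buf) {
  std::printf("one transfer back to the host: %zu blocks\n", dev_buf.size());
}

int main() {
  std::vector<double> device_answers;          // stands in for ans->force/engv
  const Stage stages[] = {{"dispersion_real", false},
                          {"multipole_real", false},
                          {"polar_real", true}};
  for (const Stage &s : stages) {
    launch_kernel(s.name, device_answers);
    if (s.last) copy_answers_to_host(device_answers);  // earlier stages skip this
  }
  return 0;
}
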
--- lib/gpu/lal_hippo.cpp | 11 +++++------ lib/gpu/lal_hippo.cu | 3 --- lib/gpu/lal_hippo.h | 2 -- src/AMOEBA/amoeba_multipole.cpp | 9 +++++---- src/GPU/pair_hippo_gpu.cpp | 2 +- 5 files changed, 11 insertions(+), 16 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 10d75f2393..b4b84cc47d 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -72,7 +72,6 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, // specific to HIPPO k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); - _pval.alloc(this->_max_tep_size,*(this->ucl_device),UCL_READ_ONLY,UCL_READ_ONLY); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; @@ -312,8 +311,8 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // only copy them back if this is the last kernel // otherwise, commenting out these two lines to leave the answers // (forces, energies and virial) on the device until the last kernel - this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - this->device->add_ans_object(this->ans); + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -430,9 +429,9 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, const int red_blocks=multipole_real(eflag,vflag); // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (polar_real) - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); + // only copy them back in the last kernel (this one, or polar_real once done) + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 040ecf9308..3bfd4f7019 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,9 +1032,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; - if (i==0 && j < 10) printf("j = %d: corei = %f; corek = %f; alphai = %f; alphak = %f; vali = %f; valk = %f\n", - j, corei, corek, alphai, alphak, vali, valk); - // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index ae604e8401..251f909b78 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -130,8 +130,6 @@ class Hippo : public BaseAmoeba { UCL_Kernel k_dispersion; - UCL_Vector _pval; - protected: bool _allocated; int dispersion_real(const int eflag, const int vflag); diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 8d9e0c101d..945ee976eb 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -379,8 +379,7 @@ void PairAmoeba::multipole_real() corek = pcore[jclass]; alphak = palpha[jclass]; valk = pval[j]; - if (i==0 && j < 10) printf("j = %d: corei = %f; corek = %f; alphai = %f; alphak = %f; vali = %f; valk = %f\n", - j, corei, corek, alphai, alphak, vali, valk); + /* printf("HIPPO MPOLE ij %d %d: pcore/alpha/val I %g %g %g: J %g %g %g\n", atom->tag[i],atom->tag[j],corei,alphai,vali,corek,alphak,valk); @@ -421,6 +420,8 @@ void PairAmoeba::multipole_real() term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; + + // find damped 
multipole intermediates for force and torque de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + @@ -527,14 +528,14 @@ void PairAmoeba::multipole_real() // increment force-based gradient and torque on second site // commenting out j parts for DEBUGGING - + fmpole[j][0] -= frcx; fmpole[j][1] -= frcy; fmpole[j][2] -= frcz; tq[j][0] += ttmk[0]; tq[j][1] += ttmk[1]; tq[j][2] += ttmk[2]; - + // increment the virial due to pairwise Cartesian forces vxx = -xr * frcx; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 3bad2d4f52..6ac22e0721 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -292,7 +292,7 @@ void PairHippoGPU::multipole_real() // set the energy unit conversion factor for multipolar real-space calculation double felec = electric / am_dielectric; - printf("hippo gpu multipole\n"); + firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, pval, sublo, subhi, atom->tag, From 5193dcf8c558eaadd49d83a52d53869c3fe1a9cf Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 26 Sep 2021 00:56:29 -0500 Subject: [PATCH 054/181] Working on the polar real-space term of hippo --- lib/gpu/lal_hippo.cpp | 3 +- lib/gpu/lal_hippo.cu | 395 ++++++++++++++++++------------------ src/AMOEBA/amoeba_polar.cpp | 2 +- 3 files changed, 205 insertions(+), 195 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index b4b84cc47d..12bf9cfd3c 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -597,7 +597,8 @@ int HippoT::polar_real(const int eflag, const int vflag) { } this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_polar.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 3bfd4f7019..afc3cf10af 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1642,7 +1642,8 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict coeff_amtype, + const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_polar, const __global int *dev_nbor, const __global int *dev_packed, @@ -1683,6 +1684,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); + numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); //numtyp4 xi__; @@ -1749,8 +1751,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, // debug: // xi__ = ix; xi__.w = itype; - numtyp pdi = coeff[itype].x; - numtyp pti = coeff[itype].y; + numtyp corei = coeff_amclass[itype].z; // pcore[iclass]; + numtyp alphai = coeff_amclass[itype].w; // palpha[iclass]; + numtyp vali = polar6[i].x; for ( ; nbor Date: Sun, 26 Sep 2021 09:11:09 -0500 Subject: [PATCH 055/181] Fixed bugs in the polar real kernel in hippo, getting closer.. 
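
A note on the extra-array layout the polar kernel above relies on (polar1 = &extra[0], polar2 = &extra[4*nall], ..., polar6 = &extra[20*nall]): cast_extra_data() appears to write the per-atom quantities as consecutive blocks of nall four-wide slots, so block b for atom i starts at extra[4*b*nall + 4*i], with the first blocks holding the rpole components plus amtype/amgroup, followed by uind, uinp, and pval. The helper below is only a host-side sketch of that packing under the assumed stride of 4; pack_block and its arguments are illustrative names, not part of the library.

#include <cstddef>
#include <vector>

// Sketch of the strided packing implied by the kernel offsets above.
// Blocks 0..5 would correspond to polar1..polar6; block 5 carries pval in .x.
static void pack_block(std::vector<double> &extra, int block, int nall,
                       const double *v0, const double *v1,
                       const double *v2, const double *v3) {
  const int stride = 4;                                  // one 4-vector per atom
  const std::size_t base =
      static_cast<std::size_t>(block) * stride * nall;   // start of this block
  for (int i = 0; i < nall; i++) {
    const std::size_t idx = base + static_cast<std::size_t>(i) * stride;
    extra[idx]     = v0 ? v0[i] : 0.0;
    extra[idx + 1] = v1 ? v1[i] : 0.0;
    extra[idx + 2] = v2 ? v2[i] : 0.0;
    extra[idx + 3] = v3 ? v3[i] : 0.0;                   // padding when unused
  }
}

int main() {
  const int nall = 8, nblocks = 6;                       // room for block 5 (polar6)
  std::vector<double> extra(static_cast<std::size_t>(4) * nblocks * nall, 0.0);
  std::vector<double> pval(nall, 1.0);
  pack_block(extra, 5, nall, pval.data(), nullptr, nullptr, nullptr);  // pval slot
  return 0;
}
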
--- lib/gpu/lal_hippo.cpp | 92 +++++++++++++++++++- lib/gpu/lal_hippo.cu | 171 +++++++++++++------------------------ lib/gpu/lal_hippo.h | 13 +++ lib/gpu/lal_hippo_ext.cpp | 4 +- src/GPU/pair_hippo_gpu.cpp | 6 +- 5 files changed, 169 insertions(+), 117 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 12bf9cfd3c..0f87104832 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) - this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - this->device->add_ans_object(this->ans); + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -568,6 +568,94 @@ int HippoT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +int** HippoT::compute_polar_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_polar, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
+ + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_polar = off2_polar; + this->_felec = felec; + this->_aewald = aewald; + const int red_blocks=polar_real(eflag,vflag); + + // only copy answers (forces, energies and virial) back from the device + // in the last kernel (which is polar_real here) + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); + + this->hd_balancer.stop_timer(); + + // copy tep from device to host + + this->_tep.update_host(this->_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Calculate the polar real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index afc3cf10af..1f9c14d4da 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1753,7 +1753,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp corei = coeff_amclass[itype].z; // pcore[iclass]; numtyp alphai = coeff_amclass[itype].w; // palpha[iclass]; - numtyp vali = polar6[i].x; + numtyp vali = polar6[i].x; for ( ; nbor { const double aewald, const double felec, const double off2_mpole, double *charge, double *boxlo, double *prd, void **tep_ptr); + /// Compute polar real-space with device neighboring + virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double felec, const double off2_polar, + double *charge, double *boxlo, double *prd, void **tep_ptr); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 390f713d98..1851c3aba3 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -194,7 +194,7 @@ int** hippo_gpu_compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint 
**special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, @@ -202,7 +202,7 @@ int** hippo_gpu_compute_polar_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return HIPPOMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 6ac22e0721..23395e5fe3 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -108,7 +108,7 @@ int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, const int nall int ** hippo_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -1089,7 +1089,7 @@ void PairHippoGPU::polar_real() firstneigh = hippo_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, + rpole, uind, uinp, pval, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, From 2efd841a7e29248170820ef1b6a079fa156baf07 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 27 Sep 2021 11:35:35 -0500 Subject: [PATCH 056/181] Trying to find the difference in the neighbor list build in hippo vs amoeba --- lib/gpu/lal_hippo.cpp | 6 +-- lib/gpu/lal_hippo.cu | 17 +++++-- lib/gpu/lal_hippo_extra.h | 92 ++++++++++++++++++------------------- src/AMOEBA/amoeba_polar.cpp | 12 +++-- src/AMOEBA/pair_amoeba.cpp | 45 +++++++++++++++++- src/GPU/pair_hippo_gpu.cpp | 4 +- 6 files changed, 114 insertions(+), 62 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 0f87104832..77bbebbb9a 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -444,7 +444,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); 
} -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 1f9c14d4da..95f18db7d2 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,6 +1032,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; + if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; @@ -1772,7 +1773,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - + const numtyp4 pol1j = polar1[j]; numtyp ck = polar1[j].x; // rpole[j][0]; numtyp dkx = polar1[j].y; // rpole[j][1]; @@ -1800,6 +1801,11 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; factor_wscale = sp_pol.x; // special_polar_wscale[sbmask15(jextra)]; + // NOTE: for in.water_box/water_hexamer.hippo: there exist wscale = 0.2 + //if (factor_wscale < (numtyp)1.0) continue; //factor_wscale = (numtyp)0; + + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, r, factor_wscale); + if (igroup == jgroup) { factor_dscale = factor_pscale = sp_pol.y; // special_polar_piscale[sbmask15(jextra)]; factor_uscale = polar_uscale; @@ -1910,7 +1916,8 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; dufld[5] += zr*tiz5 + zr*zr*tuir; - + + // get the field gradient for direct polarization force numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; @@ -1929,7 +1936,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, term1k = rr3k - rr5k*xr*xr; term2k = (numtyp)2.0*rr5k*xr; term3k = rr7k*xr*xr - rr5k; - term4k = 2.0*rr5k; + term4k = (numtyp)2.0*rr5k; term5k = (numtyp)5.0*rr7k*xr; term6k = rr9k*xr*xr; tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i - @@ -2046,7 +2053,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcx = (numtyp)-2.0 * depx; numtyp frcy = (numtyp)-2.0 * depy; numtyp frcz = (numtyp)-2.0 * depz; - + // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo // tixx and tkxx @@ -2159,7 +2166,7 @@ __kernel void k_special15(__global int * dev_nbor, int which = sj >> SBBITS & 3; int j = sj & NEIGHMASK; tagint jtag = tag[j]; - + if (i == 0 && j < 20) printf("GPU: j = %d; jtag = %d\n", j, jtag); if (!which) { int offset=ii; for (int k=0; k= 11) { r8 = r7 * r; dmpi27 = dmpi2 * dmpi26; - d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0; } // treat the case where alpha damping exponents are unequal @@ -97,12 +97,12 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, r3 = r2 * r; r4 = r3 * r; r5 = r4 * r; - dmpi2 = 0.5 * dmpi; - dmpk2 = 0.5 * dmpk; + dmpi2 = (numtyp)0.5 * dmpi; + dmpk2 = (numtyp)0.5 * dmpk; dampi = dmpi2 * r; dampk = dmpk2 * r; - expi = exp(-dampi); - expk = exp(-dampk); + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); dmpi22 = dmpi2 * dmpi2; dmpi23 = dmpi22 * dmpi2; dmpi24 = dmpi23 * dmpi2; @@ -112,34 +112,34 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const 
numtyp rr1, dmpk24 = dmpk23 * dmpk2; dmpk25 = dmpk24 * dmpk2; term = dmpi22 - dmpk22; - pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0); - tmp = 4.0 * dmpi2 * dmpk2 / term; + pre = (numtyp)8192.0 * dmpi23 * dmpk23 / ucl_powr(term,(numtyp)4.0); + tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; s = (dampi-tmp)*expk + (dampk+tmp)*expi; - ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; - d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - - (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + - (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + - 4.0*dmpi2*dmpk2/term) * expi; - d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - - (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - - 4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + - (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + - (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + - 4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi; - d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + - dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - - (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - - (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + - dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + - (4.0/105.0)*dmpi25*dmpk2*r4/term + (8.0/21.0)*dmpi24*dmpk2*r3/term + - (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + (4.0/105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/21.0)*dmpi24*dmpk2*r3/term + + (12.0/7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; if (rorder >= 11) { @@ -217,8 +217,8 @@ ucl_inline void damppole(const numtyp r, const int rorder, diff = fabs(alphai-alphak); dampi = alphai 
* r; dampk = alphak * r; - expi = exp(-dampi); - expk = exp(-dampk); + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); // core-valence charge penetration damping for Gordon f1 @@ -308,15 +308,15 @@ ucl_inline void damppole(const numtyp r, const int rorder, if (rorder >= 11) { dampi6 = dampi3 * dampi3; dampk6 = dampk3 * dampk3; - dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - 5.0*dampi4/126.0 + 2.0*dampi5/315.0 + - dampi6/1890.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + - 2.0*dampk5/315.0 + dampk6/1890.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + - dampi4/63.0 + dampi5/945.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + - dampk4/63.0 + dampk5/945.0)*expk; + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + 5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dampk4/63.0 + dampk5/(numtyp)945.0)*expk; } } } diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e6b3e6ef70..4fa8a5d892 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -366,6 +366,7 @@ void PairAmoeba::polar_real() yr = x[j][1] - yi; zr = x[j][2] - zi; r2 = xr*xr + yr*yr + zr*zr; + if (r2 > off2) continue; jtype = amtype[j]; @@ -393,7 +394,7 @@ void PairAmoeba::polar_real() factor_uscale = 1.0; } } - + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale); r = sqrt(r2); ck = rpole[j][0]; dkx = rpole[j][1]; @@ -567,7 +568,7 @@ void PairAmoeba::polar_real() ufld[j][0] += tkx3 + xr*tukr; ufld[j][1] += tky3 + yr*tukr; ufld[j][2] += tkz3 + zr*tukr; - + // get induced dipole field gradient used for quadrupole torques if (amoeba) { @@ -579,7 +580,6 @@ void PairAmoeba::polar_real() tkz5 = 2.0 * (psr5*uiz+dsr5*uizp); tuir = -psr7*ukr - dsr7*ukrp; tukr = -psr7*uir - dsr7*uirp; - // reached here... 
} else if (hippo) { tix5 = 4.0 * (rr5i*ukx); tiy5 = 4.0 * (rr5i*uky); @@ -597,7 +597,6 @@ void PairAmoeba::polar_real() dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir; dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir; dufld[i][5] += zr*tiz5 + zr*zr*tuir; - dufld[j][0] -= xr*tkx5 + xr*xr*tukr; dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr; dufld[j][2] -= yr*tky5 + yr*yr*tukr; @@ -668,7 +667,7 @@ void PairAmoeba::polar_real() frcx = depx; frcy = depy; frcz = depz; - + // get the dEp/dR terms used for direct polarization force term1 = bn[2] - psc3*rr5; @@ -855,6 +854,7 @@ void PairAmoeba::polar_real() frcx = -2.0 * depx; frcy = -2.0 * depy; frcz = -2.0 * depz; + } // get the dtau/dr terms used for mutual polarization force @@ -1199,6 +1199,8 @@ void PairAmoeba::polar_real() torque2force(i,tep,fix,fiy,fiz,fpolar); + //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, tep[0], tep[1], tep[2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 5157739f0e..1ff35e7ce1 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -242,6 +242,47 @@ void PairAmoeba::compute(int eflag, int vflag) time_induce = time_polar = time_qxfer = 0.0; } + { // DEBUGGING + double **x = atom->x; + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + int i,ii,j,jj; + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + double xi = x[i][0]; + double yi = x[i][1]; + double zi = x[i][2]; + + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + double factor_mpole = special_mpole[sbmask15(j)]; + j &= NEIGHMASK15; + + double xr = x[j][0] - xi; + double yr = x[j][1] - yi; + double zr = x[j][2] - zi; + double r2 = xr*xr + yr*yr + zr*zr; + if (r2 > off2) continue; + double r = sqrt(r2); + if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); + } + } + + + } // DEBUGGING + double evdwl; evdwl = 0.0; @@ -973,8 +1014,8 @@ void PairAmoeba::init_style() int irequest = neighbor->request(this,instance_me); // for DEBUGGING with GPU - //neighbor->requests[irequest]->half = 0; - //neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; // open debug output files // names are hard-coded diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 23395e5fe3..4da0056029 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -1137,6 +1137,8 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, _tq[2] = tq_ptr[4*i+2]; torque2force(i,_tq,fix,fiy,fiz,force_comp); + //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, _tq[0], _tq[1], _tq[2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; From c6148938e5682075c10aff97f4cc9992ca5abc65 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 27 Sep 2021 12:36:11 -0500 Subject: [PATCH 057/181] Debugging the neighbor list in hippo vs amoeba --- src/AMOEBA/pair_amoeba.cpp | 4 ++-- 
src/DIPOLE/pair_lj_cut_dipole_cut.cpp | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 1ff35e7ce1..59d85814ec 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -274,9 +274,9 @@ void PairAmoeba::compute(int eflag, int vflag) double yr = x[j][1] - yi; double zr = x[j][2] - zi; double r2 = xr*xr + yr*yr + zr*zr; - if (r2 > off2) continue; + //if (r2 > off2) continue; double r = sqrt(r2); - if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); + if (i == 0) printf("j = %d: tag = %d; r = %f; factor_mpole = %f\n", j, r, atom->tag[j], factor_mpole); } } diff --git a/src/DIPOLE/pair_lj_cut_dipole_cut.cpp b/src/DIPOLE/pair_lj_cut_dipole_cut.cpp index 0f8a7317c6..e536d9d76e 100644 --- a/src/DIPOLE/pair_lj_cut_dipole_cut.cpp +++ b/src/DIPOLE/pair_lj_cut_dipole_cut.cpp @@ -19,6 +19,7 @@ #include "atom.h" #include "neighbor.h" #include "neigh_list.h" +#include "neigh_request.h" #include "comm.h" #include "force.h" #include "memory.h" @@ -90,6 +91,8 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) numneigh = list->numneigh; firstneigh = list->firstneigh; + int maxsize = 10; + // loop over neighbors of my atoms for (ii = 0; ii < inum; ii++) { @@ -102,6 +105,13 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) jlist = firstneigh[i]; jnum = numneigh[i]; + double scale_dipole = 1.0; + if (jnum > maxsize) { + scale_dipole = maxsize; //1.0/(double)maxsize; + } else { + scale_dipole = jnum; //1.0/(double)jnum; + } + for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; @@ -207,7 +217,7 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) // total force - fq = factor_coul*qqrd2e; + fq = scale_dipole*factor_coul*qqrd2e; fx = fq*forcecoulx + delx*forcelj; fy = fq*forcecouly + dely*forcelj; fz = fq*forcecoulz + delz*forcelj; @@ -221,7 +231,7 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) torque[i][1] += fq*tiycoul; torque[i][2] += fq*tizcoul; - if (newton_pair || j < nlocal) { + if (newton_pair) { f[j][0] -= fx; f[j][1] -= fy; f[j][2] -= fz; @@ -362,7 +372,9 @@ void PairLJCutDipoleCut::init_style() if (!atom->q_flag || !atom->mu_flag || !atom->torque_flag) error->all(FLERR,"Pair dipole/cut requires atom attributes q, mu, torque"); - neighbor->request(this,instance_me); + int irequest = neighbor->request(this,instance_me); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; } /* ---------------------------------------------------------------------- From d27836952aa4a753c931935d4f818c734282b9f7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 27 Sep 2021 16:12:49 -0500 Subject: [PATCH 058/181] Fixed a bug in neighbor.cpp to make special_flag consistent between amoeba and hippo (to be 2 instead of 0), that caused missing neighbors with hippo --- lib/gpu/lal_hippo.cpp | 4 ++-- src/AMOEBA/pair_amoeba.cpp | 45 ++------------------------------------ src/GPU/pair_hippo_gpu.cpp | 2 +- src/neighbor.cpp | 2 ++ 4 files changed, 7 insertions(+), 46 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 77bbebbb9a..d31370be73 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) - 
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - this->device->add_ans_object(this->ans); + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 59d85814ec..5157739f0e 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -242,47 +242,6 @@ void PairAmoeba::compute(int eflag, int vflag) time_induce = time_polar = time_qxfer = 0.0; } - { // DEBUGGING - double **x = atom->x; - int inum,jnum; - int *ilist,*jlist,*numneigh,**firstneigh; - - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - - if (use_ewald) choose(MPOLE_LONG); - else choose(MPOLE); - - int i,ii,j,jj; - for (ii = 0; ii < inum; ii++) { - i = ilist[ii]; - double xi = x[i][0]; - double yi = x[i][1]; - double zi = x[i][2]; - - jlist = firstneigh[i]; - jnum = numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - double factor_mpole = special_mpole[sbmask15(j)]; - j &= NEIGHMASK15; - - double xr = x[j][0] - xi; - double yr = x[j][1] - yi; - double zr = x[j][2] - zi; - double r2 = xr*xr + yr*yr + zr*zr; - //if (r2 > off2) continue; - double r = sqrt(r2); - if (i == 0) printf("j = %d: tag = %d; r = %f; factor_mpole = %f\n", j, r, atom->tag[j], factor_mpole); - } - } - - - } // DEBUGGING - double evdwl; evdwl = 0.0; @@ -1014,8 +973,8 @@ void PairAmoeba::init_style() int irequest = neighbor->request(this,instance_me); // for DEBUGGING with GPU - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; + //neighbor->requests[irequest]->half = 0; + //neighbor->requests[irequest]->full = 1; // open debug output files // names are hard-coded diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 4da0056029..1067969c7b 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } diff --git a/src/neighbor.cpp b/src/neighbor.cpp index 0ee4051d4b..dcb6a21b7c 100644 --- a/src/neighbor.cpp +++ b/src/neighbor.cpp @@ -527,6 +527,7 @@ void Neighbor::init() int flag=0; for (int isub=0; isub < ph->nstyles; ++isub) { if (force->pair_match("amoeba",0,isub) + || force->pair_match("hippo",0,isub) || force->pair_match("coul/wolf",0,isub) || force->pair_match("coul/dsf",0,isub) || force->pair_match("coul/exclude",0) @@ -537,6 +538,7 @@ void Neighbor::init() special_flag[1] = special_flag[2] = special_flag[3] = 2; } else { if (force->pair_match("amoeba",0) + || force->pair_match("hippo",0) || force->pair_match("coul/wolf",0) || force->pair_match("coul/dsf",0) || force->pair_match("coul/exclude",0) From 8d54547bc0693abb68bd6aa033a3375e6506846e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 00:50:33 -0500 Subject: [PATCH 059/181] Commented out debugging commands in the hippo kernels, added (numtyp) to numerics in hippo_extra, replaced fabs with explicit func --- lib/gpu/lal_hippo.cu | 2 - lib/gpu/lal_hippo_extra.h | 129 +++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 66 deletions(-) diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 95f18db7d2..45361ed1fb 100644 --- 
a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,7 +1032,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; - if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; @@ -2166,7 +2165,6 @@ __kernel void k_special15(__global int * dev_nbor, int which = sj >> SBBITS & 3; int j = sj & NEIGHMASK; tagint jtag = tag[j]; - if (i == 0 && j < 20) printf("GPU: j = %d; jtag = %d\n", j, jtag); if (!which) { int offset=ii; for (int k=0; k= 11) { r6 = r5 * r; @@ -168,12 +168,12 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, d3s = d3s * rr7; d4s = d4s * rr9; d5s = d5s * rr11; - dmpik[0] = 0.5 * pre * s * s; + dmpik[0] = (numtyp)0.5 * pre * s * s; dmpik[2] = pre * s * ds; dmpik[4] = pre * (s*d2s + ds*ds); - dmpik[6] = pre * (s*d3s + 3.0*ds*d2s); - dmpik[8] = pre * (s*d4s + 4.0*ds*d3s + 3.0*d2s*d2s); - if (rorder >= 11) dmpik[10] = pre * (s*d5s + 5.0*ds*d4s + 10.0*d2s*d3s); + dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); + dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); } /* ---------------------------------------------------------------------- @@ -213,8 +213,9 @@ ucl_inline void damppole(const numtyp r, const int rorder, // compute tolerance and exponential damping factors - eps = 0.001; - diff = fabs(alphai-alphak); + eps = (numtyp)0.001; + diff = alphai-alphak; + if (diff < (numtyp)0) diff = -diff; dampi = alphai * r; dampk = alphak * r; expi = ucl_exp(-dampi); @@ -226,12 +227,12 @@ ucl_inline void damppole(const numtyp r, const int rorder, dampi3 = dampi * dampi2; dampi4 = dampi2 * dampi2; dampi5 = dampi2 * dampi3; - dmpi[0] = 1.0 - (1.0 + 0.5*dampi)*expi; - dmpi[2] = 1.0 - (1.0 + dampi + 0.5*dampi2)*expi; - dmpi[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi; - dmpi[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + dampi4/30.0)*expi; - dmpi[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - 4.0*dampi4/105.0 + dampi5/210.0)*expi; + dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi; if (diff < eps) { dmpk[0] = dmpi[0]; dmpk[2] = dmpi[2]; @@ -243,12 +244,12 @@ ucl_inline void damppole(const numtyp r, const int rorder, dampk3 = dampk * dampk2; dampk4 = dampk2 * dampk2; dampk5 = dampk2 * dampk3; - dmpk[0] = 1.0 - (1.0 + 0.5*dampk)*expk; - dmpk[2] = 1.0 - (1.0 + dampk + 0.5*dampk2)*expk; - dmpk[4] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk; - dmpk[6] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk; - dmpk[8] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + - 4.0*dampk4/105.0 + dampk5/210.0)*expk; + dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + 
dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk; + dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk; } // valence-valence charge penetration damping for Gordon f1 @@ -256,22 +257,22 @@ ucl_inline void damppole(const numtyp r, const int rorder, if (diff < eps) { dampi6 = dampi3 * dampi3; dampi7 = dampi3 * dampi4; - dmpik[0] = 1.0 - (1.0 + 11.0*dampi/16.0 + 3.0*dampi2/16.0 + - dampi3/48.0)*expi; - dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 + - 7.0*dampi3/48.0 + dampi4/48.0)*expi; - dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - dampi4/24.0 + dampi5/144.0)*expi; - dmpik[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - dampi4/24.0 + dampi5/120.0 + dampi6/720.0)*expi; - dmpik[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + - dampi7/5040.0)*expi; + dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + + dampi3/(numtyp)48.0)*expi; + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi; + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; + dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi; + dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0)*expi; if (rorder >= 11) { dampi8 = dampi4 * dampi4; - dmpik[10] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + - dampi7/5040.0 + dampi8/45360.0)*expi; + dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi; } } else { @@ -281,29 +282,29 @@ ucl_inline void damppole(const numtyp r, const int rorder, termk = alphai2 / (alphai2-alphak2); termi2 = termi * termi; termk2 = termk * termk; - dmpik[0] = 1.0 - termi2*(1.0 + 2.0*termk + 0.5*dampi)*expi - - termk2*(1.0 + 2.0*termi + 0.5*dampk)*expk; - dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi - - termk2*(1.0+dampk+0.5*dampk2)*expk - - 2.0*termi2*termk*(1.0+dampi)*expi - - 2.0*termk2*termi*(1.0+dampk)*expk; - dmpik[4] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + dampi2/3.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + dampk2/3.0)*expk; - dmpik[6] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + - dampi3/6.0 + dampi4/30.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + 2.0*dampi2/5.0 + dampi3/15.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + 2.0*dampk2/5.0 + dampk3/15.0)*expk; - dmpik[8] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - 4.0*dampi4/105.0 + dampi5/210.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + - 4.0*dampk4/105.0 + dampk5/210.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + 3.0*dampi2/7.0 + - 2.0*dampi3/21.0 + dampi4/105.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + 3.0*dampk2/7.0 + - 2.0*dampk3/21.0 + dampk4/105.0)*expk; + 
dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - + termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk; + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk; + dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk; + dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + + (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + + (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk; if (rorder >= 11) { dampi6 = dampi3 * dampi3; @@ -311,12 +312,12 @@ ucl_inline void damppole(const numtyp r, const int rorder, dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + dampi6/(numtyp)1890.0)*expi - - termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + 5.0*dampk4/(numtyp)126.0 + + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + - dampi4/63.0 + dampi5/(numtyp)945.0)*expi - - (numtyp)2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + - dampk4/63.0 + dampk5/(numtyp)945.0)*expk; + dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk; } } } From e80eea56ba0c4548f4ffadf47529100d7d16179f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 14:59:39 -0500 Subject: [PATCH 060/181] Added udirect2b and umutual2b for hippo --- cmake/Modules/Packages/GPU.cmake | 3 + lib/gpu/Opencl.makefile | 3 + lib/gpu/lal_amoeba.cu | 10 +- lib/gpu/lal_hippo.cpp | 155 ++++++++++++++++++++++- lib/gpu/lal_hippo.cu | 211 ++++++++++++++++--------------- lib/gpu/lal_hippo.h | 34 +++++ lib/gpu/lal_hippo_ext.cpp | 8 +- lib/gpu/lal_hippo_extra.h | 105 ++++++++++++++- src/AMOEBA/amoeba_induce.cpp | 2 +- src/GPU/pair_hippo_gpu.cpp | 26 ++-- 10 files changed, 426 insertions(+), 131 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake 
b/cmake/Modules/Packages/GPU.cmake index 2b6977005d..cf5bcd2ea2 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -172,6 +172,7 @@ elseif(GPU_API STREQUAL "OPENCL") ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu + ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu ) foreach(GPU_KERNEL ${GPU_LIB_CU}) @@ -188,6 +189,7 @@ elseif(GPU_API STREQUAL "OPENCL") GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu) GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu) GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu) + GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu) list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h @@ -197,6 +199,7 @@ elseif(GPU_API STREQUAL "OPENCL") ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h + ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ) add_library(gpu STATIC ${GPU_LIB_SOURCES}) diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 2ff98827d4..64a2161f85 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra. $(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; +$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h + $(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h; + $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index e4d129214a..1deb3e3bb5 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1064,7 +1064,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - numtyp tdipdip[6]; // the following tdipdip is incorrect!! 
needs work to store tdipdip + numtyp tdipdip[6]; tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; tdipdip[1] = bcn[1]*xr*yr; tdipdip[2] = bcn[1]*xr*zr; @@ -1233,10 +1233,10 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; const numtyp4 pol2j = polar2[j]; numtyp qkxx = pol2j.x; // rpole[j][4]; numtyp qkxy = pol2j.y; // rpole[j][5]; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d31370be73..caf910863f 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -489,6 +489,81 @@ int HippoT::multipole_real(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the permanent field +// --------------------------------------------------------------------------- +template +int** HippoT::compute_udirect2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double* host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _fieldp array ------------------------ + + if (inum_full>this->_max_fieldp_size) { + this->_max_fieldp_size=static_cast(static_cast(inum_full)*1.10); + this->_fieldp.resize(this->_max_fieldp_size*8); + } + *fieldp_ptr=this->_fieldp.host.begin(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + const int red_blocks=udirect2b(eflag,vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + this->_fieldp.update_host(this->_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; +} + // 
--------------------------------------------------------------------------- // Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- @@ -518,7 +593,8 @@ int HippoT::udirect2b(const int eflag, const int vflag) { } this->k_udirect2b.set_size(GX,BX); - this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, @@ -529,6 +605,80 @@ int HippoT::udirect2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the induced field +// --------------------------------------------------------------------------- +template +int** HippoT::compute_umutual2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer extra data from the host + // and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _fieldp array ------------------------ + + if (inum_full>this->_max_fieldp_size) { + this->_max_fieldp_size=static_cast(static_cast(inum_full)*1.10); + this->_fieldp.resize(this->_max_fieldp_size*8); + } + *fieldp_ptr=this->_fieldp.host.begin(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + const int red_blocks=umutual2b(eflag,vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + this->_fieldp.update_host(this->_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Calculate the real-space induced field, returning field and fieldp // --------------------------------------------------------------------------- @@ -558,7 +708,8 @@ int 
HippoT::umutual2b(const int eflag, const int vflag) { } this->k_umutual2b.set_size(GX,BX); - this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 45361ed1fb..487e852baf 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -16,7 +16,6 @@ #if defined(NV_KERNEL) || defined(USE_HIP) #include #include "lal_hippo_extra.h" -//#include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int #endif @@ -985,10 +984,10 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp qiyz = pol3i.x; // rpole[i][9]; numtyp qizz = pol3i.y; // rpole[i][12]; itype = pol3i.z; // amtype[i]; - iclass = coeff_amtype[itype].w; // amtype2class[itype]; - numtyp corei = coeff_amclass[itype].z; // pcore[iclass]; - numtyp alphai = coeff_amclass[itype].w; // palpha[iclass]; + iclass = coeff_amtype[itype].w; // amtype2class[itype]; + numtyp corei = coeff_amclass[iclass].z; // pcore[iclass]; + numtyp alphai = coeff_amclass[iclass].w; // palpha[iclass]; numtyp vali = polar6[i].x; for ( ; nbor { const double aewald, const double felec, const double off2_mpole, double *charge, double *boxlo, double *prd, void **tep_ptr); + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + virtual int** compute_udirect2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double* host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr); + + /// Compute the real space part of the induced field (umutual2b) with device neighboring + virtual int** compute_umutual2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr); + /// Compute polar real-space with device neighboring virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 1851c3aba3..16b697d88f 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -157,7 +157,7 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int 
*host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, @@ -166,7 +166,7 @@ int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return HIPPOMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); @@ -175,7 +175,7 @@ int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, @@ -184,7 +184,7 @@ int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return HIPPOMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h index a06ac4425c..cacee4ae72 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -59,7 +59,7 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, // compute tolerance value for damping exponents eps = (numtyp)0.001; - diff = dmpi-dmpk; + diff = dmpi-dmpk; // fabs(dmpi-dmpk) if (diff < (numtyp)0) diff = -diff; // treat the case where alpha damping exponents are equal @@ -322,6 +322,109 @@ ucl_inline void damppole(const numtyp r, const int rorder, } } +/* ---------------------------------------------------------------------- + dampdir = direct field damping coefficents + dampdir generates coefficients for the direct field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ +ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk) +{ + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampk2; + numtyp dampi3,dampk3; + numtyp dampi4,dampk4; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = 
dampi2 * dampi2; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + if (diff < eps) { + dmpk[2] = dmpi[2]; + dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/30.0)*expk; + } +} + +/* ---------------------------------------------------------------------- + dampmut = mutual field damping coefficents + dampmut generates coefficients for the mutual field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ + +ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampk2,dampk3; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // valence-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + if (diff < eps) { + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 + + 7.0*dampi3/48.0 + dampi4/48.0)*expi; + dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/144.0)*expi; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + alphai2 = alphai * alphai; + alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * termk; + dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi - + termk2*(1.0+dampk+0.5*dampk2)*expk - + 2.0*termi2*termk*(1.0+dampi)*expi - 2.0*termk2*termi*(1.0+dampk)*expk; + dmpik[4] = 1.0 - termi2*(1.0+dampi+0.5*dampi2 + dampi3/6.0)*expi - + termk2*(1.0+dampk+0.5*dampk2 + dampk3/6.00)*expk - + 2.0*termi2*termk *(1.0+dampi+dampi2/3.0)*expi - + 2.0*termk2*termi *(1.0+dampk+dampk2/3.0)*expk; + } +} #endif diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 5b855abdd0..617eb89fcd 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -1900,7 +1900,7 @@ void PairAmoeba::dampmut(double r, double alphai, double alphak, double *dmpik) ------------------------------------------------------------------------- */ void PairAmoeba::dampdir(double r, double alphai, double alphak, - double *dmpi, double *dmpk) + double dmpi[7], double dmpk[7]) { double eps,diff; double expi,expk; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 1067969c7b..f4cbf28561 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -89,7 +89,7 @@ int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, 
double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -98,7 +98,7 @@ int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -136,8 +136,8 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_repulsion_ready = false; // true for HIPPO when ready gpu_dispersion_real_ready = true; // true for HIPPO when ready gpu_multipole_real_ready = true; - gpu_udirect2b_ready = false; - gpu_umutual2b_ready = false; + gpu_udirect2b_ready = true; + gpu_umutual2b_ready = true; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -791,7 +791,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) firstneigh = hippo_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, - uind, uinp, sublo, subhi, atom->tag, + uind, uinp, pval, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, @@ -1015,14 +1015,14 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) else choose(POLAR); firstneigh = hippo_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); + atom->type, amtype, amgroup, rpole, + uind, uinp, pval, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success,aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From bf88ab77fa6f07a1a8ffa94c2154b268965ffe7f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 15:06:30 -0500 Subject: [PATCH 061/181] Cleaned up unused variables in kernel (to be continued) --- lib/gpu/lal_hippo.cu | 14 -------------- lib/gpu/lal_hippo_extra.h | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 487e852baf..f643f2b994 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1692,19 +1692,10 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, if (ii Date: Tue, 28 Sep 2021 17:28:33 -0500 Subject: [PATCH 062/181] Removed trailing spaces --- lib/gpu/lal_amoeba.cpp | 8 +- lib/gpu/lal_amoeba.cu | 80 ++++++++++---------- lib/gpu/lal_amoeba.h | 6 
+- lib/gpu/lal_base_amoeba.cpp | 32 ++++---- lib/gpu/lal_base_amoeba.h | 2 +- lib/gpu/lal_hippo.cpp | 22 +++--- lib/gpu/lal_hippo.cu | 130 ++++++++++++++++---------------- lib/gpu/lal_hippo.h | 6 +- lib/gpu/lal_hippo_ext.cpp | 4 +- lib/gpu/lal_hippo_extra.h | 146 ++++++++++++++++++------------------ src/GPU/Install.sh | 2 + src/GPU/pair_amoeba_gpu.cpp | 62 +++++++-------- src/GPU/pair_hippo_gpu.cpp | 64 ++++++++-------- 13 files changed, 283 insertions(+), 281 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 8d9af4706e..917166c423 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -140,7 +140,7 @@ void AmoebaT::clear() { coeff_amclass.clear(); sp_polar.clear(); sp_nonpolar.clear(); - + this->clear_atomic(); } @@ -169,7 +169,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_mpole, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -194,7 +194,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { - int ainum=this->ans->inum(); + int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -216,7 +216,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->short_nbor_polar_avail = true; } - + this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 1deb3e3bb5..fdb959f3e2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -492,7 +492,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; //int jtype=jx.w; - + // Compute r12 numtyp xr = jx.x - ix.x; numtyp yr = jx.y - ix.y; @@ -500,7 +500,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp r2 = xr*xr + yr*yr + zr*zr; //if (r2>off2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -533,12 +533,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -585,11 +585,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - 
qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -650,20 +650,20 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); // compute the torque components for this interaction - numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); // increment force-based gradient and torque on first site @@ -691,12 +691,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -898,7 +898,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - + scalek = factor_pscale; bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; @@ -918,7 +918,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1044,7 +1044,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) + // if (poltyp != DIRECT) numtyp scale3 = (numtyp)1.0; numtyp scale5 = (numtyp)1.0; numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] @@ -1056,7 +1056,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, scale3 = (numtyp)1.0 - expdamp; scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); } - + } else { // damp == 0: ??? 
} @@ -1071,17 +1071,17 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; tdipdip[4] = bcn[1]*yr*zr; tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; - //if (i==0 && j == 10) + //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - + _fieldp[0] += fid[0]; _fieldp[1] += fid[1]; _fieldp[2] += fid[2]; @@ -1093,7 +1093,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); - + const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -1383,7 +1383,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp tiy3 = psr3*uky + dsr3*ukyp; numtyp tiz3 = psr3*ukz + dsr3*ukzp; numtyp tuir = -psr5*ukr - dsr5*ukrp; - + ufld[0] += tix3 + xr*tuir; ufld[1] += tiy3 + yr*tuir; ufld[2] += tiz3 + zr*tuir; @@ -1394,14 +1394,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); tuir = -psr7*ukr - dsr7*ukrp; - + dufld[0] += xr*tix5 + xr*xr*tuir; dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; dufld[2] += yr*tiy5 + yr*yr*tuir; dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; dufld[5] += zr*tiz5 + zr*zr*tuir; - + // get the dEd/dR terms used for direct polarization force term1 = bn[2] - dsc3*rr5; @@ -1473,7 +1473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp frcz = depz; // get the dEp/dR terms used for direct polarization force - + // tixx and tkxx term1 = bn[2] - psc3*rr5; term2 = bn[3] - psc5*rr7; @@ -1550,7 +1550,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // get the dtau/dr terms used for mutual polarization force // poltyp == MUTUAL && amoeba - + term1 = bn[2] - usc3*rr5; term2 = bn[3] - usc5*rr7; term3 = usr5 + term1; @@ -1617,7 +1617,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // ii { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// Special polar values [0-4]: + /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, /// sp_polar.z = special_polar_piscale /// sp_polar.w = special_mpole UCL_D_Vec sp_polar; - /// Special nonpolar values [0-4]: + /// Special nonpolar values [0-4]: /// sp_nonpolar.x = special_hal /// sp_nonpolar.y special_repel /// sp_nonpolar.z = special_disp @@ -97,7 +97,7 @@ class Amoeba : public BaseAmoeba { int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); int polar_real(const int eflag, const int vflag); - + }; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c4fdb8c9e5..3728fbe85e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -106,7 +106,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, 
_threads_per_atom); if (success!=0) return success; - + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -121,7 +121,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _maxspecial=maxspecial; _maxspecial15=maxspecial15; - // allocate per-atom array tep + // allocate per-atom array tep int ef_nall=nlocal; //nall; if (ef_nall==0) @@ -250,7 +250,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, const double aewald, const double felec, + bool &success, const double aewald, const double felec, const double off2_polar, double *host_q, const int nlocal, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); @@ -280,7 +280,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f dev_special15_t.clear(); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } *tep_ptr=_tep.host.begin(); @@ -320,7 +320,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f _off2_polar = off2_polar; _felec = felec; const int red_blocks=polar_real(eflag,vflag); - + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -375,7 +375,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall dev_special15_t.clear(); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } if (inum_full==0) { @@ -462,7 +462,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -509,7 +509,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -560,7 +560,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (inum_full>_max_fieldp_size) { @@ -698,7 +698,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -745,7 +745,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, numtyp4* p = 
(numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -809,7 +809,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, int idx = n+i*nstride; pextra[idx] = uinp[i][0]; pextra[idx+1] = uinp[i][1]; - pextra[idx+2] = uinp[i][2]; + pextra[idx+2] = uinp[i][2]; } } @@ -818,7 +818,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = pval[i]; + pextra[idx] = pval[i]; } } } @@ -846,7 +846,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); - + _compiled=true; #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) @@ -874,13 +874,13 @@ int BaseAmoebaT::add_onefive_neighbors() { int _nall=atom->nall(); int ainum=ans->inum(); int nbor_pitch=nbor->nbor_pitch(); - + k_special15.set_size(GX,BX); k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), &atom->dev_tag, &dev_nspecial15, &dev_special15, &ainum, &_nall, &nbor_pitch, &_threads_per_atom); - + return GX; } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index fc665ec731..bd30fc3fbb 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -287,7 +287,7 @@ class BaseAmoeba { virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; - + }; } diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index caf910863f..ac221f8376 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -145,7 +145,7 @@ void HippoT::clear() { coeff_amclass.clear(); sp_polar.clear(); sp_nonpolar.clear(); - + this->clear_atomic(); } @@ -199,7 +199,7 @@ int** HippoT::precompute(const int ago, const int inum_full, const int nall, this->dev_special15_t.clear(); this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } if (inum_full==0) { @@ -286,7 +286,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -339,7 +339,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_disp, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -397,7 +397,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the 
kernels are ready, precompute() is needed only once @@ -468,7 +468,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_mpole, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -537,7 +537,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (inum_full>this->_max_fieldp_size) { @@ -569,7 +569,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, // --------------------------------------------------------------------------- template int HippoT::udirect2b(const int eflag, const int vflag) { - int ainum=this->ans->inum(); + int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -591,7 +591,7 @@ int HippoT::udirect2b(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->short_nbor_polar_avail = true; } - + this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &coeff_amclass, &sp_polar, @@ -756,7 +756,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -803,7 +803,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index f643f2b994..b282586efb 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -491,7 +491,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; //int jtype=jx.w; - + // Compute r12 numtyp xr = ix.x - jx.x; numtyp yr = ix.y - jx.y; @@ -499,7 +499,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp r2 = xr*xr + yr*yr + zr*zr; if (r2>off2) continue; - + const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -514,7 +514,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qkyz = pol3j.x; // rpole[j][9]; numtyp qkzz = pol3j.y; // rpole[j][12]; int jtype = pol3j.z; // amtype[j]; - + numtyp sizk = coeff[jtype].x; // sizpr[jtype]; numtyp dmpk = coeff[jtype].y; // dmppr[jtype]; numtyp valk = coeff[jtype].z; // elepr[jtype]; @@ -534,12 +534,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -586,11 
+586,11 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -616,7 +616,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); numtyp term4 = dir*qkr - dkr*qir - 4.0*qik; numtyp term5 = qir*qkr; - numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + + numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8]; // compute the Pauli repulsion energy for this interaction @@ -626,7 +626,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, // calculate intermediate terms for force and torque - numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + + numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + term4*dmpik[8] + term5*dmpik[10]; term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6]; term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6]; @@ -637,23 +637,23 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); frcx = frcx*rr1 + eterm*rr3*xr; frcy = frcy*rr1 + eterm*rr3*yr; frcz = frcz*rr1 + eterm*rr3*zr; // compute the torque components for this interaction - - numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - + + numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); ttmix = sizik * ttmix * rr1; ttmiy = sizik * ttmiy * rr1; @@ -706,7 +706,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + int jtype = polar3[j].z; // amtype[j]; int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; @@ -816,7 +816,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp dk = ak * r; numtyp expi = ucl_exp(-di); numtyp expk = 
ucl_exp(-dk); - + numtyp ai2,ak2; numtyp di4,di5; numtyp dk2,dk3; @@ -844,7 +844,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; - ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + + ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0); } else { @@ -856,7 +856,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; - + // apply damping and scaling factors for this interaction numtyp scale = factor_disp * damp*damp; @@ -892,7 +892,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, virial[4] += vzx; virial[5] += vzy; } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -1043,12 +1043,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -1095,11 +1095,11 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -1164,16 +1164,16 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11; rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1; rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; - numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + - term1i*rr1i + term1k*rr1k + term1ik*rr1ik + - term2i*rr3i + term2k*rr3k + term2ik*rr3ik + + numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + + term1i*rr1i + term1k*rr1k + term1ik*rr1ik + + term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; // find damped multipole intermediates for force and torque - numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + - term1i*rr3i + term1k*rr3k + term1ik*rr3ik + - term2i*rr5i + term2k*rr5k + term2ik*rr5ik + + numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + + term1i*rr3i + term1k*rr3k + term1ik*rr3ik + + term2i*rr5i + term2k*rr5k + term2ik*rr5ik + term3i*rr7i + term3k*rr7k + term3ik*rr7ik; term1 = 
-corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik; term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik; @@ -1187,20 +1187,20 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); // compute the torque components for this interaction - numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); // increment force-based gradient and torque on first site @@ -1228,12 +1228,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1408,7 +1408,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, // find the field components for charge penetration damping numtyp dmpi[7],dmpk[7]; dampdir(r,alphai,alphak,dmpi,dmpk); - + numtyp scalek = factor_dscale; numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3; numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5; @@ -1439,7 +1439,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, rr3k*dky + (numtyp)2.0*rr5k*qky; fip[2] = -zr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - rr3k*dkz + (numtyp)2.0*rr5k*qkz; - + // find terms needed later to compute mutual polarization _fieldp[0] += fid[0]; @@ -1453,7 +1453,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1595,7 +1595,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, } // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) + // if (poltyp != DIRECT) numtyp dmpik[5]; dampmut(r,alphai,alphak,dmpik); numtyp scalek = factor_wscale; @@ -1610,17 +1610,17 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, tdipdip[3] = -rr3ik + rr5ik*yr*yr; tdipdip[4] = rr5ik*yr*zr; tdipdip[5] = -rr3ik + rr5ik*zr*zr; - //if (i==0 && j == 10) + //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - + _fieldp[0] += fid[0]; _fieldp[1] 
+= fid[1]; _fieldp[2] += fid[2]; @@ -1632,7 +1632,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; @@ -1905,7 +1905,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, // get the field gradient for direct polarization force - + numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; numtyp term1k,term2k,term3k,term4k,term5k,term6k,term7k,term8k; numtyp term1core; @@ -1987,7 +1987,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, dir*term4i - qixy*term5i + qiy*term6i + qix*term7i - qir*term8i; tkxy = -valk*term1k - corek*term1core - dky*term2k - dkx*term3k + dkr*term4k - qkxy*term5k + qky*term6k + qkx*term7k - qkr*term8k; - + term2i = rr5i*xr; term1i = zr * term2i; term1core = rr5core*xr*zr; @@ -2039,7 +2039,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcx = (numtyp)-2.0 * depx; numtyp frcy = (numtyp)-2.0 * depy; numtyp frcz = (numtyp)-2.0 * depz; - + // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo // tixx and tkxx @@ -2108,7 +2108,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // ii { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// Special polar values [0-4]: + /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, /// sp_polar.z = special_polar_piscale /// sp_polar.w = special_mpole UCL_D_Vec sp_polar; - /// Special nonpolar values [0-4]: + /// Special nonpolar values [0-4]: /// sp_nonpolar.x = special_hal /// sp_nonpolar.y special_repel /// sp_nonpolar.z = special_disp @@ -184,7 +184,7 @@ class Hippo : public BaseAmoeba { int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); int polar_real(const int eflag, const int vflag); - + }; } diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 16b697d88f..982cf894a6 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -129,7 +129,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd) { + double *host_q, double *boxlo, double *prd) { return HIPPOMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -175,7 +175,7 @@ int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h index 61bfebc17f..0b8f96f69b 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -116,46 +116,46 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; s = (dampi-tmp)*expk + 
(dampk+tmp)*expi; - ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - - ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + - ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - - (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - - (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + - (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + - ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; - d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + - dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - - ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - - ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + - (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + - dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + - ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + - ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + + ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - + if (rorder >= 11) { r6 = r5 * r; dmpi26 = dmpi25 * dmpi2; dmpk26 = dmpk25 * dmpk2; - d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + - dmpi2*dmpk23*r4/21.0 + 
dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - - (4.0/945.0)*dmpi2*dmpk26*r5/term - - (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - - (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + - dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + - (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + - (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + + d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - + (4.0/945.0)*dmpi2*dmpk26*r5/term - + (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - + (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + + (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + + (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; } } @@ -214,7 +214,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, // compute tolerance and exponential damping factors eps = (numtyp)0.001; - diff = alphai-alphak; + diff = alphai-alphak; if (diff < (numtyp)0) diff = -diff; dampi = alphai * r; dampk = alphak * r; @@ -231,7 +231,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; - dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi; if (diff < eps) { dmpk[0] = dmpi[0]; @@ -248,7 +248,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk; - dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk; } @@ -257,21 +257,21 @@ ucl_inline void damppole(const numtyp r, const int rorder, if (diff < eps) { dampi6 = dampi3 * dampi3; dampi7 = dampi3 * dampi4; - dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + + dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + dampi3/(numtyp)48.0)*expi; - dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi; - dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; - dmpik[6] = 
(numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi; - dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + dampi7/(numtyp)5040.0)*expi; if (rorder >= 11) { dampi8 = dampi4 * dampi4; dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi; } @@ -282,41 +282,41 @@ ucl_inline void damppole(const numtyp r, const int rorder, termk = alphai2 / (alphai2-alphak2); termi2 = termi * termi; termk2 = termk * termk; - dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - + dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk; dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; - dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - - termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk; - dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + - dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - - termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - + dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk; - dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - - termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + - (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + - (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + + dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 
+ + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + + (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk; - + if (rorder >= 11) { dampi6 = dampi3 * dampi3; dampk6 = dampk3 * dampk3; - dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + - dampi6/(numtyp)1890.0)*expi - - termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + - (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + - dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk; } } @@ -404,9 +404,9 @@ ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) if (diff < eps) { dampi4 = dampi2 * dampi2; dampi5 = dampi2 * dampi3; - dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + 7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi; - dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; } else { dampk2 = dampk * dampk; @@ -417,12 +417,12 @@ ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) termk = alphai2 / (alphai2-alphak2); termi2 = termi * termi; termk2 = termk * termk; - dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; - dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - - (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - + (numtyp)2.0*termi2*termk 
*((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk; } } diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 9e231663c0..9da06cf636 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -91,6 +91,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp action pair_gauss_gpu.h pair_gauss.h action pair_gayberne_gpu.cpp pair_gayberne.cpp action pair_gayberne_gpu.h pair_gayberne.cpp +action pair_hippo_gpu.cpp pair_hippo.cpp +action pair_hippo_gpu.h pair_hippo.cpp action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp action pair_lj96_cut_gpu.h pair_lj96_cut.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 91bc679447..e1fe1f1097 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -76,7 +76,7 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -86,7 +86,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -170,7 +170,7 @@ void PairAmoebaGPU::init_style() maxspecial=atom->maxspecial; maxspecial15=atom->maxspecial15; } - + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, @@ -207,7 +207,7 @@ void PairAmoebaGPU::multipole_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -239,7 +239,7 @@ void PairAmoebaGPU::multipole_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -281,7 +281,7 @@ void PairAmoebaGPU::induce() // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - + if (use_ewald) { choose(POLAR_LONG); int nmine = p_kspace->nfft_owned; @@ -317,7 +317,7 @@ void PairAmoebaGPU::induce() memory->create(usump,nlocal,3,"ameoba/induce:usump"); // get the electrostatic field due to permanent multipoles - + dfield0c(field,fieldp); // need reverse_comm_pair if dfield0c (i.e. 
udirect2b) is CPU-only @@ -345,7 +345,7 @@ void PairAmoebaGPU::induce() for (i = 0; i < 10; i++) { printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); + udirp[i][0], udirp[i][1], udirp[i][2]); } */ // get induced dipoles via the OPT extrapolation method @@ -353,7 +353,7 @@ void PairAmoebaGPU::induce() // uopt,uoptp with a optorder+1 dimension, just optorder ?? // since no need to store optorder+1 values after these loops - if (poltyp == OPT) { + if (poltyp == OPT) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uopt[i][0][j] = udir[i][j]; @@ -460,7 +460,7 @@ void PairAmoebaGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); } - + //error->all(FLERR,"STOP GPU"); // set initial conjugate gradient residual and conjugate vector @@ -486,7 +486,7 @@ void PairAmoebaGPU::induce() cfstyle = RSD; comm->forward_comm_pair(this); uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -574,7 +574,7 @@ void PairAmoebaGPU::induce() if (pcgprec) { cfstyle = RSD; comm->forward_comm_pair(this); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -629,7 +629,7 @@ void PairAmoebaGPU::induce() if (iter >= politer) done = true; // apply a "peek" iteration to the mutual induced dipoles - + if (done) { for (i = 0; i < nlocal; i++) { term = pcgpeek * poli[i]; @@ -644,7 +644,7 @@ void PairAmoebaGPU::induce() // terminate the calculation if dipoles failed to converge // NOTE: could make this an error - + if (iter >= maxiter || eps > epsold) if (me == 0) error->warning(FLERR,"AMOEBA induced dipoles did not converge"); @@ -652,7 +652,7 @@ void PairAmoebaGPU::induce() // DEBUG output to dump file - if (uind_flag) + if (uind_flag) dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); // deallocation of arrays @@ -700,7 +700,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) PairAmoeba::udirect2b(field, fieldp); return; } - + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -753,7 +753,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -764,7 +764,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - + } /* ---------------------------------------------------------------------- @@ -802,7 +802,7 @@ void PairAmoebaGPU::udirect2b_cpu() firstneigh = list->firstneigh; // NOTE: doesn't this have a problem if aewald is tiny ?? 
- + aesq2 = 2.0 * aewald * aewald; aesq2n = 0.0; if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); @@ -829,13 +829,13 @@ void PairAmoebaGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; ddi = dirdamp[itype]; - + // evaluate all sites within the cutoff distance for (jj = 0; jj < jnum; jj++) { jextra = jlist[jj]; j = jextra & NEIGHMASK15; - + xr = x[j][0] - x[i][0]; yr = x[j][1] - x[i][1]; zr = x[j][2] - x[i][2]; @@ -844,7 +844,7 @@ void PairAmoebaGPU::udirect2b_cpu() jtype = amtype[j]; jgroup = amgroup[j]; - + factor_wscale = special_polar_wscale[sbmask15(jextra)]; if (igroup == jgroup) { factor_pscale = special_polar_piscale[sbmask15(jextra)]; @@ -872,7 +872,7 @@ void PairAmoebaGPU::udirect2b_cpu() aefac = aesq2 * aefac; bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; } - + // find terms needed later to compute mutual polarization if (poltyp != DIRECT) { @@ -891,7 +891,7 @@ void PairAmoebaGPU::udirect2b_cpu() scalek = factor_uscale; bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; - + neighptr[n++] = j; tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; tdipdip[ndip++] = bcn[1]*xr*yr; @@ -902,7 +902,7 @@ void PairAmoebaGPU::udirect2b_cpu() } else { if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } - + } // jj firstneigh_dipole[i] = neighptr; @@ -973,7 +973,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -1001,7 +1001,7 @@ void PairAmoebaGPU::polar_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1033,7 +1033,7 @@ void PairAmoebaGPU::polar_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1091,11 +1091,11 @@ void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; - vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); - vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); virial_comp[0] += vxx; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index f4cbf28561..fbc1b6b238 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -88,7 +88,7 @@ int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -185,7 +185,7 @@ void PairHippoGPU::init_style() 
maxspecial=atom->maxspecial; maxspecial15=atom->maxspecial15; } - + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, @@ -222,7 +222,7 @@ void PairHippoGPU::dispersion_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -250,7 +250,7 @@ void PairHippoGPU::dispersion_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, off2, atom->q, domain->boxlo, domain->prd); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); } @@ -270,7 +270,7 @@ void PairHippoGPU::multipole_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -302,7 +302,7 @@ void PairHippoGPU::multipole_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -344,7 +344,7 @@ void PairHippoGPU::induce() // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - + if (use_ewald) { choose(POLAR_LONG); int nmine = p_kspace->nfft_owned; @@ -380,7 +380,7 @@ void PairHippoGPU::induce() memory->create(usump,nlocal,3,"ameoba/induce:usump"); // get the electrostatic field due to permanent multipoles - + dfield0c(field,fieldp); // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only @@ -408,7 +408,7 @@ void PairHippoGPU::induce() for (i = 0; i < 10; i++) { printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); + udirp[i][0], udirp[i][1], udirp[i][2]); } */ // get induced dipoles via the OPT extrapolation method @@ -416,7 +416,7 @@ void PairHippoGPU::induce() // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
// since no need to store optorder+1 values after these loops - if (poltyp == OPT) { + if (poltyp == OPT) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uopt[i][0][j] = udir[i][j]; @@ -523,7 +523,7 @@ void PairHippoGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); } - + //error->all(FLERR,"STOP GPU"); // set initial conjugate gradient residual and conjugate vector @@ -549,7 +549,7 @@ void PairHippoGPU::induce() cfstyle = RSD; comm->forward_comm_pair(this); uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -637,7 +637,7 @@ void PairHippoGPU::induce() if (pcgprec) { cfstyle = RSD; comm->forward_comm_pair(this); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -692,7 +692,7 @@ void PairHippoGPU::induce() if (iter >= politer) done = true; // apply a "peek" iteration to the mutual induced dipoles - + if (done) { for (i = 0; i < nlocal; i++) { term = pcgpeek * poli[i]; @@ -707,7 +707,7 @@ void PairHippoGPU::induce() // terminate the calculation if dipoles failed to converge // NOTE: could make this an error - + if (iter >= maxiter || eps > epsold) if (me == 0) error->warning(FLERR,"hippo induced dipoles did not converge"); @@ -715,7 +715,7 @@ void PairHippoGPU::induce() // DEBUG output to dump file - if (uind_flag) + if (uind_flag) dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); // deallocation of arrays @@ -763,7 +763,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) PairAmoeba::udirect2b(field, fieldp); return; } - + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -816,7 +816,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -827,7 +827,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - + } /* ---------------------------------------------------------------------- @@ -865,7 +865,7 @@ void PairHippoGPU::udirect2b_cpu() firstneigh = list->firstneigh; // NOTE: doesn't this have a problem if aewald is tiny ?? 
- + aesq2 = 2.0 * aewald * aewald; aesq2n = 0.0; if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); @@ -892,13 +892,13 @@ void PairHippoGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; ddi = dirdamp[itype]; - + // evaluate all sites within the cutoff distance for (jj = 0; jj < jnum; jj++) { jextra = jlist[jj]; j = jextra & NEIGHMASK15; - + xr = x[j][0] - x[i][0]; yr = x[j][1] - x[i][1]; zr = x[j][2] - x[i][2]; @@ -907,7 +907,7 @@ void PairHippoGPU::udirect2b_cpu() jtype = amtype[j]; jgroup = amgroup[j]; - + factor_wscale = special_polar_wscale[sbmask15(jextra)]; if (igroup == jgroup) { factor_pscale = special_polar_piscale[sbmask15(jextra)]; @@ -935,7 +935,7 @@ void PairHippoGPU::udirect2b_cpu() aefac = aesq2 * aefac; bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; } - + // find terms needed later to compute mutual polarization if (poltyp != DIRECT) { @@ -954,7 +954,7 @@ void PairHippoGPU::udirect2b_cpu() scalek = factor_uscale; bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; - + neighptr[n++] = j; tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; tdipdip[ndip++] = bcn[1]*xr*yr; @@ -965,7 +965,7 @@ void PairHippoGPU::udirect2b_cpu() } else { if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } - + } // jj firstneigh_dipole[i] = neighptr; @@ -1036,7 +1036,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -1064,7 +1064,7 @@ void PairHippoGPU::polar_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1096,7 +1096,7 @@ void PairHippoGPU::polar_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1156,11 +1156,11 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; - vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); - vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); virial_comp[0] += vxx; From 98a2b6729299574aea7fb29f5f4f6fc3a253dce2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 17:39:55 -0500 Subject: [PATCH 063/181] Changed to the API of BaseAmoeba to reduce duplicates in hippo --- lib/gpu/lal_amoeba_ext.cpp | 27 +++++---------------------- lib/gpu/lal_base_amoeba.cpp | 18 +++++++++--------- lib/gpu/lal_base_amoeba.h | 10 +++++----- 3 files changed, 19 insertions(+), 36 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 565f16b627..b73f6c4ca6 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -116,24 +116,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas void amoeba_gpu_clear() { AMOEBAMF.clear(); } -/* -int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, 
- const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd) { - return AMOEBAMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, - tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd); -} -*/ + int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -145,7 +128,7 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, + host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); @@ -163,7 +146,7 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); @@ -181,7 +164,7 @@ int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); @@ -199,7 +182,7 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 3728fbe85e..7322dde5df 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ 
b/lib/gpu/lal_base_amoeba.cpp @@ -341,7 +341,7 @@ template int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -433,7 +433,7 @@ template int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -474,7 +474,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, + nullptr, nullptr, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -522,7 +522,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -555,7 +555,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, sublo, subhi, tag, + host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -596,7 +596,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -629,7 +629,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, sublo, subhi, tag, + host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -669,7 +669,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -710,7 +710,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, 
host_x, host_type, host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, sublo, subhi, tag, + host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index bd30fc3fbb..accb9a5900 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -134,7 +134,7 @@ class BaseAmoeba { virtual int** precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double *sublo, double *subhi, + double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, @@ -145,7 +145,7 @@ class BaseAmoeba { /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, @@ -158,7 +158,7 @@ class BaseAmoeba { virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -172,7 +172,7 @@ class BaseAmoeba { virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -186,7 +186,7 @@ class BaseAmoeba { virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double *sublo, double *subhi, + double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, From 6286a119b354f31238bb0026fc44440fda7d6335 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 23:12:07 -0500 Subject: [PATCH 064/181] Removed precompute() in hippo --- lib/gpu/lal_base_amoeba.cpp | 4 +- lib/gpu/lal_hippo.cpp | 130 +++++------------------------------- lib/gpu/lal_hippo.cu | 36 +++++----- lib/gpu/lal_hippo.h | 12 ---- 4 files changed, 38 insertions(+), 144 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 7322dde5df..8b002c27e6 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -399,12 +399,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall if (!success) return nullptr; atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, 
host_rpole, host_uind, host_uinp); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index ac221f8376..5a6ac20633 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -154,120 +154,24 @@ double HippoT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Hippo); } -// --------------------------------------------------------------------------- -// Prepare for multiple kernel calls in a time step: -// - reallocate per-atom arrays, if needed -// - transfer extra data from host to device -// - build the full neighbor lists for use by different kernels -// --------------------------------------------------------------------------- - -template -int** HippoT::precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - - // ------------------- Resize 1-5 neighbor arrays ------------------------ - - if (nall>this->_nmax) { - this->_nmax = nall; - this->dev_nspecial15.clear(); - this->dev_special15.clear(); - this->dev_special15_t.clear(); - this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); - this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); - } - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return nullptr; - } - - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - success); - if (!success) - return nullptr; - this->atom->cast_q_data(host_q); - this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->atom->cast_q_data(host_q); - this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - this->atom->add_q_data(); - 
this->atom->add_extra_data(); - - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); - - this->device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, - boxlo, prd); - - // re-allocate dev_short_nbor if necessary - if (inum_full*(2+this->_max_nbors) > this->dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(inum_full)*1.10); - this->dev_short_nbor.resize((2+this->_max_nbors)*this->_nmax); - } - - return this->nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- template int** HippoT::compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_disp, - double *host_q, double *boxlo, double *prd) { + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_disp, + double *host_q, double *boxlo, double *prd) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -296,7 +200,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, nullptr, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -407,7 +311,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. 
int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, nullptr, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -530,7 +434,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, // and build the neighbor lists if needed int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -645,7 +549,7 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, // and build the neighbor lists if needed int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -766,7 +670,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index b282586efb..2e62d0703e 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -410,21 +410,21 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_nonpolar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - __global acctyp4 *restrict tep, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - const int t_per_atom, const numtyp aewald, - const numtyp off2, const numtyp cut2, - const numtyp c0, const numtyp c1, const numtyp c2, - const numtyp c3, const numtyp c4, const numtyp c5) + const __global numtyp *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict sp_nonpolar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global acctyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp off2, const numtyp cut2, + const numtyp c0, const numtyp c1, const numtyp c2, + const numtyp c3, const numtyp c4, const numtyp c5) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -895,9 +895,11 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } // ii { const double gpu_split, FILE *_screen, const double polar_dscale, const double polar_uscale); - /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed - int** 
precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double* host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **&ilist, int **&numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd); - /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, From b95508125b36cc004c62385ad0391bf25fd7c01d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 23:24:34 -0500 Subject: [PATCH 065/181] Adding the repulsion kernel for hippo --- lib/gpu/lal_hippo.cpp | 113 +++++++++++++++++++++++++++++++++++++++++- lib/gpu/lal_hippo.h | 18 ++++++- 2 files changed, 129 insertions(+), 2 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 5a6ac20633..9a45ea6fc8 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -36,7 +36,9 @@ HippoT::Hippo() : BaseAmoeba(), template HippoT::~Hippo() { clear(); + k_repulsion.clear(); k_dispersion.clear(); + } template @@ -71,6 +73,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, return success; // specific to HIPPO + k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion"); k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); // If atom type constants fit in shared memory use fast kernel @@ -154,10 +157,118 @@ double HippoT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Hippo); } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute repulsion +// --------------------------------------------------------------------------- +template +int** HippoT::compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
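(For reference, the eflag/vflag setup at the top of this routine follows the convention used by all of the compute_* entry points in this series: 0 means that quantity is not accumulated, 1 means global energy/virial accumulation only, and 2 means per-atom accumulation; building with LAL_NO_BLOCK_REDUCE promotes any nonzero flag to 2. A small self-contained sketch of that encoding, illustrative only and not part of the patch:

    #include <cstdio>

    // 0 = off, 1 = global accumulation only, 2 = per-atom accumulation
    static int accumulation_flag(bool per_atom, bool global_requested) {
      if (per_atom) return 2;
      if (global_requested) return 1;
      return 0;
    }

    int main() {
      printf("eflag=%d vflag=%d\n",
             accumulation_flag(false, true),  // global energy only -> 1
             accumulation_flag(true, true));  // per-atom virial requested -> 2
      return 0;
    }
)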
+ + int** firstneigh = nullptr; + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + this->_off2_repulse = off2_repulse; + this->_aewald = aewald; + const int red_blocks=repulsion(eflag,vflag); + + // only copy them back if this is the last kernel + // otherwise, commenting out these two lines to leave the answers + // (forces, energies and virial) on the device until the last kernel + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); + + this->hd_balancer.stop_timer(); + + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Calculate the repulsion term, returning tep +// --------------------------------------------------------------------------- +template +int HippoT::repulsion(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_disp, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_repulse, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_repulsion.set_size(GX,BX); + k_repulsion.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_repulse); + this->time_pair.stop(); + + return GX; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- - template int** HippoT::compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 78f85db7df..17e3a1b03f 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -54,6 +54,21 @@ class Hippo : public BaseAmoeba { const double gpu_split, FILE *_screen, const double polar_dscale, const double polar_uscale); + /// Compute repulsion with device neighboring + int** compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd); + /// Compute dispersion real-space 
with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -163,10 +178,11 @@ class Hippo : public BaseAmoeba { numtyp _polar_dscale, _polar_uscale; numtyp _qqrd2e; - UCL_Kernel k_dispersion; + UCL_Kernel k_repulsion, k_dispersion; protected: bool _allocated; + int repulsion(const int eflag, const int vflag); int dispersion_real(const int eflag, const int vflag); int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); From 17edd797a7a57ee227ab44cba1861f4bbbdec798 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 23:42:04 -0500 Subject: [PATCH 066/181] Adding API for the repulsion term to hippo/gpu --- lib/gpu/lal_hippo.cpp | 16 +++++++-- lib/gpu/lal_hippo.h | 2 +- lib/gpu/lal_hippo_ext.cpp | 17 +++++++++ src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_hippo_gpu.cpp | 70 ++++++++++++++++++++++++++++++++++++++ src/GPU/pair_hippo_gpu.h | 1 + 6 files changed, 104 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 9a45ea6fc8..80762b55aa 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -173,7 +173,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd) { + double *host_q, double *boxlo, double *prd, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -210,6 +210,14 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + this->_off2_repulse = off2_repulse; this->_aewald = aewald; const int red_blocks=repulsion(eflag,vflag); @@ -222,6 +230,10 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, this->hd_balancer.stop_timer(); + // copy tep from device to host + + this->_tep.update_host(this->_max_tep_size*4,false); + return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -257,7 +269,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { &coeff_amtype, &coeff_amclass, &sp_nonpolar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, &this->_off2_repulse); diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 17e3a1b03f..374ca5d836 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -67,7 +67,7 @@ class Hippo : public BaseAmoeba { int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd); + double *host_q, double *boxlo, double *prd, void** tep_ptr); /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 982cf894a6..2f1a800589 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -120,6 +120,23 @@ 
void hippo_gpu_clear() { HIPPOMF.clear(); } +int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + return HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, tep_ptr); +} + int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 8a2f09d443..5ba7aae981 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -343,7 +343,7 @@ class PairAmoeba : public Pair { void hal(); - void repulsion(); + virtual void repulsion(); void damprep(double, double, double, double, double, double, double, double, int, double, double, double *); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index fbc1b6b238..4852f75e08 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -66,6 +66,17 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double polar_dscale, const double polar_uscale, int& tq_size); void hippo_gpu_clear(); +int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr); + int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -209,6 +220,65 @@ void PairHippoGPU::init_style() /* ---------------------------------------------------------------------- */ +void PairHippoGPU::repulsion() +{ + if (!gpu_repulsion_ready) { + PairAmoeba::repulsion(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + choose(REPULSE); + + // set the energy unit conversion factor for multipolar real-space calculation + + firstneigh = hippo_gpu_compute_repulsion(neighbor->ago, 
inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tq_single) { + float *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, frepulse, virrepulse); + } else { + double *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, frepulse, virrepulse); + } +} + +/* ---------------------------------------------------------------------- */ + void PairHippoGPU::dispersion_real() { if (!gpu_dispersion_real_ready) { diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 9e961045eb..c7a4e75ebe 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -35,6 +35,7 @@ class PairHippoGPU : public PairAmoeba { virtual void induce(); + virtual void repulsion(); virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); From 4be44c386f2408cb68f7816d1a80544ba6a73b59 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 29 Sep 2021 09:40:33 -0500 Subject: [PATCH 067/181] Added necessary arguments to the hippo repulsion kernel --- lib/gpu/lal_hippo.cpp | 30 +++++++++++++++++++++++++----- lib/gpu/lal_hippo.cu | 14 +++++++------- lib/gpu/lal_hippo.h | 30 ++++++++++++++++++------------ lib/gpu/lal_hippo_ext.cpp | 10 ++++++++-- src/GPU/pair_hippo_gpu.cpp | 11 ++++++++--- 5 files changed, 66 insertions(+), 29 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 80762b55aa..6830847e98 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -57,6 +57,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, const double *host_csix, const double *host_adisp, const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, @@ -99,6 +100,16 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(coeff_amtype,host_write,false); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_sizpr[i]; + host_write[i].y = host_dmppr[i]; + host_write[i].z = host_elepr[i]; + host_write[i].w = (numtyp)0; + } + + coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_rep,host_write,false); + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < max_amclass; i++) { host_write2[i].x = host_csix[i]; @@ -133,7 +144,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, _polar_uscale = polar_uscale; _allocated=true; - this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + coeff_amclass.row_bytes() + + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -145,6 +156,7 @@ void HippoT::clear() { _allocated=false; coeff_amtype.clear(); + coeff_rep.clear(); coeff_amclass.clear(); sp_polar.clear(); sp_nonpolar.clear(); @@ -173,7 
+185,9 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -219,7 +233,13 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, *tep_ptr=this->_tep.host.begin(); this->_off2_repulse = off2_repulse; - this->_aewald = aewald; + _cut2 = cut2; + _c0 = c0; + _c1 = c1; + _c2 = c2; + _c3 = c3; + _c4 = c4; + _c5 = c5; const int red_blocks=repulsion(eflag,vflag); // only copy them back if this is the last kernel @@ -266,13 +286,13 @@ int HippoT::repulsion(const int eflag, const int vflag) { k_repulsion.set_size(GX,BX); k_repulsion.run(&this->atom->x, &this->atom->extra, - &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &coeff_rep, &sp_nonpolar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, - &this->_off2_repulse); + &this->_off2_repulse, &_cut2, &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); this->time_pair.stop(); return GX; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 2e62d0703e..1b6344a163 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -411,7 +411,7 @@ _texture( q_tex,int2); __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict coeff_rep, const __global numtyp4 *restrict sp_nonpolar, const __global int *dev_nbor, const __global int *dev_packed, @@ -480,9 +480,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qiyz = pol3i.x; // rpole[i][9]; numtyp qizz = pol3i.y; // rpole[i][12]; int itype = pol3i.z; // amtype[i]; - numtyp sizi = coeff[itype].x; // sizpr[itype]; - numtyp dmpi = coeff[itype].y; // dmppr[itype]; - numtyp vali = coeff[itype].z; // elepr[itype]; + numtyp sizi = coeff_rep[itype].x; // sizpr[itype]; + numtyp dmpi = coeff_rep[itype].y; // dmppr[itype]; + numtyp vali = coeff_rep[itype].z; // elepr[itype]; for ( ; nbor { const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, const double *host_csix, const double *host_adisp, const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, @@ -56,18 +57,20 @@ class Hippo : public BaseAmoeba { /// Compute repulsion with device neighboring int** compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, void** tep_ptr); + const int nall, double **host_x, + int *host_type, int *host_amtype, + int 
*host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5,void** tep_ptr); /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, @@ -157,6 +160,8 @@ class Hippo : public BaseAmoeba { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; + /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; + UCL_D_Vec coeff_rep; /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, @@ -175,6 +180,7 @@ class Hippo : public BaseAmoeba { /// Number of atom types int _lj_types; + numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5; numtyp _polar_dscale, _polar_uscale; numtyp _qqrd2e; diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 2f1a800589..15cb53cdb1 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -37,6 +37,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, const double *host_csix, const double *host_adisp, const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, @@ -74,6 +75,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, host_csix, host_adisp, host_pcore, host_palpha, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, @@ -99,6 +101,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, host_csix, host_adisp, host_pcore, host_palpha, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, @@ -129,12 +132,15 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { return HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, tep_ptr); + cpu_time, success, aewald, off2, host_q, boxlo, prd, + cut2, c0, c1, c2, c3, c4, c5, tep_ptr); } int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, diff --git 
a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 4852f75e08..d6a16c72fb 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -58,6 +58,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, const double *host_csix, const double *host_adisp, const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, @@ -75,7 +76,9 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd, void **tep_ptr); + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr); int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, @@ -203,7 +206,8 @@ void PairHippoGPU::init_style() pdamp, thole, dirdamp, amtype2class, special_hal, special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, - special_polar_pscale, csix, adisp, pcore, palpha, + special_polar_pscale, sizpr, dmppr, elepr, + csix, adisp, pcore, palpha, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, polar_dscale, polar_uscale, tq_size); @@ -261,7 +265,8 @@ void PairHippoGPU::repulsion() eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, aewald, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From 01381b7f54ac6d01f48444ec00997cacff775dbe Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 29 Sep 2021 11:57:25 -0500 Subject: [PATCH 068/181] Fixed bugs in the repulsion kernel, now working correctly with the double precision mode --- lib/gpu/lal_base_amoeba.cpp | 2 +- lib/gpu/lal_hippo.cu | 22 +++++++++++++--------- lib/gpu/lal_hippo_extra.h | 30 +++++++++++++++--------------- src/GPU/pair_hippo_gpu.cpp | 2 +- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 8b002c27e6..5d1b7016da 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -78,7 +78,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, bool rot = false; bool vel = false; _extra_fields = 24; // round up to accomodate quadruples of numtyp values - // rpole 13; uind 3; uinp 3; amtype, amgroup + // rpole 13; uind 3; uinp 3; amtype, amgroup; pval int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 1b6344a163..bf63652a47 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -493,9 +493,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, //int jtype=jx.w; // Compute r12 - numtyp xr = ix.x - jx.x; - numtyp yr = ix.y - jx.y; - numtyp zr = ix.z - jx.z; + numtyp xr = jx.x - ix.x; + numtyp yr = jx.y - ix.y; + numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; if (r2>off2) continue; @@ -521,6 +521,7 
@@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)]; + if (factor_repel == (numtyp)0) continue; // intermediates involving moments and separation distance @@ -614,7 +615,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp term1 = vali*valk; numtyp term2 = valk*dir - vali*dkr + dik; numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); - numtyp term4 = dir*qkr - dkr*qir - 4.0*qik; + numtyp term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; numtyp term5 = qir*qkr; numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8]; @@ -646,6 +647,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, frcx = frcx*rr1 + eterm*rr3*xr; frcy = frcy*rr1 + eterm*rr3*yr; frcz = frcz*rr1 + eterm*rr3*zr; + frcx = sizik * frcx; + frcy = sizik * frcy; + frcz = sizik * frcz; // compute the torque components for this interaction @@ -666,7 +670,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp r4 = r2 * r2; numtyp r5 = r2 * r3; numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0; - numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp).0*c4*r3 + + numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp)4.0*c4*r3 + (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1; dtaper *= e * rr1; e *= taper; @@ -896,10 +900,10 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } // ii Date: Wed, 29 Sep 2021 12:32:08 -0500 Subject: [PATCH 069/181] Fixed bugs with damprep where ucl_powr in mixed precision failed with a negative single-reprecision base --- lib/gpu/lal_hippo.cpp | 3 ++- lib/gpu/lal_hippo.cu | 5 +++-- lib/gpu/lal_hippo_extra.h | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 6830847e98..d1b61d5415 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -292,7 +292,8 @@ int HippoT::repulsion(const int eflag, const int vflag) { &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, - &this->_off2_repulse, &_cut2, &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); + &this->_off2_repulse, &_cut2, + &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); this->time_pair.stop(); return GX; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index bf63652a47..fae6cf1681 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -644,13 +644,14 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, term4*qiy + term5*qky + term6*(qiyk+qkyi); numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); + frcx = frcx*rr1 + eterm*rr3*xr; frcy = frcy*rr1 + eterm*rr3*yr; frcz = frcz*rr1 + eterm*rr3*zr; frcx = sizik * frcx; frcy = sizik * frcy; frcz = sizik * frcz; - + // compute the torque components for this interaction numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - @@ -903,7 +904,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, - offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); + offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } /* ---------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo_extra.h 
b/lib/gpu/lal_hippo_extra.h index 2afcc963ec..ac02e2e9e8 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -112,7 +112,7 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, dmpk24 = dmpk23 * dmpk2; dmpk25 = dmpk24 * dmpk2; term = dmpi22 - dmpk22; - pre = (numtyp)8192.0 * dmpi23 * dmpk23 / ucl_powr(term,(numtyp)4.0); + pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0); tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; s = (dampi-tmp)*expk + (dampk+tmp)*expi; @@ -173,6 +173,7 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, dmpik[4] = pre * (s*d2s + ds*ds); dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); } From e0f91b96fe064a93bd478c250a768aa9eee70ff5 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 29 Sep 2021 13:07:20 -0500 Subject: [PATCH 070/181] Cleaned up and added necessary comments --- src/GPU/pair_amoeba_gpu.cpp | 12 +++++++-- src/GPU/pair_hippo_gpu.cpp | 51 +++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index e1fe1f1097..65a4af7d64 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -192,7 +192,10 @@ void PairAmoebaGPU::init_style() tq_single = true; } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ void PairAmoebaGPU::multipole_real() { @@ -257,6 +260,8 @@ void PairAmoebaGPU::multipole_real() /* ---------------------------------------------------------------------- induce = induced dipole moments via pre-conditioned CG solver adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to call reverse_comm() for crstyle = FIELD; ------------------------------------------------------------------------- */ void PairAmoebaGPU::induce() @@ -986,7 +991,10 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ void PairAmoebaGPU::polar_real() { diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 533abef4d9..014b14471e 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -146,9 +146,9 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tq_pinned = nullptr; - gpu_hal_ready = false; // always false for HIPPO - gpu_repulsion_ready = true; // true for HIPPO when ready - gpu_dispersion_real_ready = true; // true for HIPPO when ready + gpu_hal_ready = false; // always false for HIPPO + gpu_repulsion_ready = true; + gpu_dispersion_real_ready = true; gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -222,7 +222,10 @@ void PairHippoGPU::init_style() tq_single = true; 
} -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + repulsion = Pauli repulsion interactions + adapted from Tinker erepel1b() routine +------------------------------------------------------------------------- */ void PairHippoGPU::repulsion() { @@ -258,15 +261,15 @@ void PairHippoGPU::repulsion() // set the energy unit conversion factor for multipolar real-space calculation firstneigh = hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd, cut2, - c0, c1, c2, c3, c4, c5, &tq_pinned); + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -282,7 +285,10 @@ void PairHippoGPU::repulsion() } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + dispersion_real = real-space portion of Ewald dispersion + adapted from Tinker edreal1d() routine +------------------------------------------------------------------------- */ void PairHippoGPU::dispersion_real() { @@ -330,7 +336,10 @@ void PairHippoGPU::dispersion_real() error->one(FLERR,"Insufficient memory on accelerator"); } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ void PairHippoGPU::multipole_real() { @@ -395,6 +404,8 @@ void PairHippoGPU::multipole_real() /* ---------------------------------------------------------------------- induce = induced dipole moments via pre-conditioned CG solver adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to call reverse_comm() for crstyle = FIELD; ------------------------------------------------------------------------- */ void PairHippoGPU::induce() @@ -879,6 +890,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). 
// udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib @@ -1124,7 +1136,10 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ void PairHippoGPU::polar_real() { @@ -1187,7 +1202,7 @@ void PairHippoGPU::polar_real() } /* ---------------------------------------------------------------------- - compute atom forces from torques + compute atom forces from torques used by various terms ------------------------------------------------------------------------- */ template @@ -1212,8 +1227,6 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, _tq[2] = tq_ptr[4*i+2]; torque2force(i,_tq,fix,fiy,fiz,force_comp); - //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, _tq[0], _tq[1], _tq[2]); - iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; From 3328ac0df2790aa2c4d16f8088d8450061d60c05 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 1 Oct 2021 09:58:21 -0500 Subject: [PATCH 071/181] Attempted to remove some redundancy in data transfers in the amoeba kernels; keeping HIPPO independent of AMOEBA for now --- lib/gpu/lal_amoeba.cpp | 6 +++--- lib/gpu/lal_amoeba.cu | 5 +++-- lib/gpu/lal_base_amoeba.cpp | 36 ++++++++++++++++++++++-------------- lib/gpu/lal_base_amoeba.h | 8 ++++---- lib/gpu/lal_hippo.cpp | 6 +++--- lib/gpu/lal_hippo.cu | 2 +- src/MAKE/Makefile.mpi | 4 ++-- 7 files changed, 38 insertions(+), 29 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 917166c423..b92e1bfd55 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -62,9 +62,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_multipole", - "k_amoeba_udirect2b", "k_amoeba_umutual2b", - "k_amoeba_polar", "k_amoeba_short_nbor"); + "k_amoeba_multipole", "k_amoeba_udirect2b", + "k_amoeba_umutual2b", "k_amoeba_polar", + "k_amoeba_short_nbor", "k_amoeba_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fdb959f3e2..befefa8dd0 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1637,12 +1637,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, else do nothing to IJ entry ------------------------------------------------------------------------- */ -__kernel void k_special15(__global int * dev_nbor, +__kernel void k_amoeba_special15(__global int * dev_nbor, const __global int * dev_packed, const __global tagint *restrict tag, const __global int *restrict nspecial15, const __global tagint *restrict special15, - const int inum, const int nall, const int nbor_pitch, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset, n_stride, i; atom_info(t_per_atom,ii,tid,offset); diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5d1b7016da..bb5eb2d53b 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -58,7 +58,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const char *k_name_udirect2b, const char *k_name_umutual2b, const char 
*k_name_polar, - const char *k_name_short_nbor) { + const char *k_name_short_nbor, + const char* k_name_special15) { screen=_screen; int gpu_nbor=0; @@ -91,7 +92,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name_multipole, - k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor); + k_name_udirect2b, k_name_umutual2b,k_name_polar, + k_name_short_nbor, k_name_special15); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -399,24 +401,22 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall if (!success) return nullptr; atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } atom->add_q_data(); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); - device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, - boxlo, prd); - // re-allocate dev_short_nbor if necessary if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) { int _nmax=static_cast(static_cast(inum_full)*1.10); @@ -463,13 +463,8 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. 
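(One recurring pattern worth noting here: the per-atom result buffers in this series, _tep, _fieldp and dev_short_nbor, are grown with roughly 10% slack, the 1.10 factor, so that small step-to-step fluctuations in the local atom count do not trigger a device reallocation every time. Below is a generic host-side sketch of that idea; grow_with_slack is a hypothetical helper written for illustration, not part of the library:

    #include <vector>

    // Grow a flat per-atom buffer only when needed, padding by ~10% so the
    // next few small increases in the atom count reuse the same allocation.
    template <class T>
    void grow_with_slack(std::vector<T> &buf, int natoms, int values_per_atom) {
      const std::size_t needed =
          static_cast<std::size_t>(natoms) * values_per_atom;
      if (needed <= buf.size()) return;
      const int padded = static_cast<int>(natoms * 1.10);
      buf.resize(static_cast<std::size_t>(padded) * values_per_atom);
    }

    int main() {
      std::vector<double> tep;        // e.g. 4 values per atom, as with _tep
      grow_with_slack(tep, 1000, 4);  // allocates room for ~1100 atoms
      grow_with_slack(tep, 1050, 4);  // still within the slack: no reallocation
      return 0;
    }
)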
int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, @@ -553,6 +548,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, // and build the neighbor lists if needed int** firstneigh = nullptr; +/* firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, @@ -560,6 +556,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); +*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -627,6 +626,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, // and build the neighbor lists if needed int** firstneigh = nullptr; +/* firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, @@ -634,6 +634,9 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); +*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -708,6 +711,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; +/* firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, @@ -715,6 +719,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); +*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); // ------------------- Resize _tep array ------------------------ @@ -829,7 +836,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor) { + const char *kname_short_nbor, + const char* kname_special15) { if (_compiled) return; @@ -843,7 +851,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_umutual2b.set_function(*pair_program,kname_umutual2b); k_polar.set_function(*pair_program,kname_polar); k_short_nbor.set_function(*pair_program,kname_short_nbor); - k_special15.set_function(*pair_program,"k_special15"); + k_special15.set_function(*pair_program,kname_special15); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index accb9a5900..6b11e25786 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -56,7 +56,7 @@ class BaseAmoeba { const double gpu_split, FILE *screen, const void *pair_program, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, - const char *kname_polar, const char *kname_short_nbor); + const char *kname_polar, const char *kname_short_nbor, const char* kname_special15); /// Estimate the overhead for GPU context changes and CPU driver void 
estimate_gpu_overhead(const int add_kernels=0); @@ -279,9 +279,9 @@ class BaseAmoeba { numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_multipole, - const char *kname_udirect2b, const char *kname_umutual2b, - const char *kname_polar, const char *kname_short_nbor); + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_short_nbor, const char* kname_special15); virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d1b61d5415..9a86be8f42 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -67,9 +67,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,hippo, - "k_hippo_multipole", - "k_hippo_udirect2b", "k_hippo_umutual2b", - "k_hippo_polar", "k_hippo_short_nbor"); + "k_hippo_multipole", "k_hippo_udirect2b", + "k_hippo_umutual2b", "k_hippo_polar", + "k_hippo_short_nbor", "k_hippo_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index fae6cf1681..cb11bd4022 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -2135,7 +2135,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, else do nothing to IJ entry ------------------------------------------------------------------------- */ -__kernel void k_special15(__global int * dev_nbor, +__kernel void k_hippo_special15(__global int * dev_nbor, const __global int * dev_packed, const __global tagint *restrict tag, const __global int *restrict nspecial15, diff --git a/src/MAKE/Makefile.mpi b/src/MAKE/Makefile.mpi index 9776b0153e..e95d80d137 100644 --- a/src/MAKE/Makefile.mpi +++ b/src/MAKE/Makefile.mpi @@ -7,12 +7,12 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler CC = mpicxx -CCFLAGS = -g -O3 +CCFLAGS = -g -O3 -fopenmp SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -LINKFLAGS = -g -O3 +LINKFLAGS = -g -O3 -fopenmp LIB = SIZE = size From f126f785a4a7f013d14264c024e100338b3971f8 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 1 Oct 2021 10:19:17 -0500 Subject: [PATCH 072/181] Removed duplicates in the amoeba kernels --- lib/gpu/lal_base_amoeba.cpp | 40 ++++++++++++++++++++----------------- lib/gpu/lal_base_amoeba.h | 2 ++ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index bb5eb2d53b..d0631442e0 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -353,20 +353,20 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall bool &success, double *host_q, double *boxlo, double *prd) { acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; + //int eflag, vflag; + if (eatom) _eflag=2; + else if (eflag_in) _eflag=1; + else _eflag=0; + if (vatom) _vflag=2; + else if (vflag_in) _vflag=1; + else _vflag=0; #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; + if (_eflag) _eflag=2; + if (_vflag) _vflag=2; #endif - set_kernel(eflag,vflag); + set_kernel(_eflag,_vflag); // ------------------- Resize 1-5 neighbor arrays 
------------------------ @@ -444,6 +444,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { +/* acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -459,7 +460,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, #endif set_kernel(eflag,vflag); - +*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: @@ -486,7 +487,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, _off2_mpole = off2_mpole; _felec = felec; _aewald = aewald; - const int red_blocks=multipole_real(eflag,vflag); + const int red_blocks=multipole_real(_eflag,_vflag); // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (polar_real) @@ -528,6 +529,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { +/* acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -543,7 +545,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, #endif set_kernel(eflag,vflag); - +*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed @@ -570,7 +572,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, _off2_polar = off2_polar; _aewald = aewald; - const int red_blocks=udirect2b(eflag,vflag); + const int red_blocks=udirect2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -606,6 +608,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { +/* acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -621,7 +624,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, #endif set_kernel(eflag,vflag); - +*/ // reallocate per-atom arrays, transfer extra data from the host // and build the neighbor lists if needed @@ -648,7 +651,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, _off2_polar = off2_polar; _aewald = aewald; - const int red_blocks=umutual2b(eflag,vflag); + const int red_blocks=umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -683,6 +686,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void **tep_ptr) { +/* acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -698,7 +702,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, #endif set_kernel(eflag,vflag); - +*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: @@ -734,7 +738,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, _off2_polar = off2_polar; _felec = felec; _aewald = aewald; - const int red_blocks=polar_real(eflag,vflag); + const int red_blocks=polar_real(_eflag,_vflag); // only copy answers (forces, energies and virial) back from the device // in the last kernel (which is polar_real here) diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 
6b11e25786..cb040c630d 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -278,6 +278,8 @@ class BaseAmoeba { numtyp _aewald,_felec; numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; + int _eflag, _vflag; + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, From f4d3d3a2b591ac0dee4d982506ae8e880394a922 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 2 Oct 2021 00:09:53 -0500 Subject: [PATCH 073/181] Gradually cleaned up and removed redundancy in amoeba and hippo --- lib/gpu/lal_base_amoeba.cpp | 156 +++----------------------------- lib/gpu/lal_base_amoeba.h | 20 ++-- lib/gpu/lal_hippo.cpp | 176 +++++++----------------------------- lib/gpu/lal_hippo.cu | 4 +- lib/gpu/lal_hippo.h | 1 - lib/gpu/lal_hippo_ext.cpp | 7 +- src/GPU/pair_hippo_gpu.cpp | 6 +- 7 files changed, 61 insertions(+), 309 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index d0631442e0..7cd410b6b8 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -245,8 +245,8 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, int *ilist, int *numj, int **firstneigh, const bool eflag_in, const bool vflag_in, @@ -353,7 +353,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall bool &success, double *host_q, double *boxlo, double *prd) { acc_timers(); - //int eflag, vflag; if (eatom) _eflag=2; else if (eflag_in) _eflag=1; else _eflag=0; @@ -401,12 +400,10 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall if (!success) return nullptr; atom->cast_q_data(host_q); - //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); - //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } @@ -444,23 +441,6 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { -/* - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); -*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: @@ -499,13 +479,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f 
%f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -529,36 +503,11 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { -/* - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); -*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed int** firstneigh = nullptr; -/* - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -577,14 +526,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); -/* - printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", - this->_fieldp.cols(), _max_fieldp_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); - printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } @@ -608,36 +550,11 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { -/* - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); -*/ // reallocate per-atom arrays, transfer extra data from the host // and build the neighbor lists if needed int** firstneigh = nullptr; -/* - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -656,14 +573,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); -/* - printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", - this->_fieldp.cols(), _max_fieldp_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); - printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } @@ -686,44 +596,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void 
**tep_ptr) { -/* - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); -*/ - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; -/* - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -750,13 +625,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -826,7 +695,6 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, n += nstride*_nall; if (pval) { - for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; pextra[idx] = pval[i]; @@ -889,9 +757,9 @@ int BaseAmoebaT::add_onefive_neighbors() { k_special15.set_size(GX,BX); k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), - &atom->dev_tag, &dev_nspecial15, &dev_special15, - &ainum, &_nall, &nbor_pitch, - &_threads_per_atom); + &atom->dev_tag, &dev_nspecial15, &dev_special15, + &ainum, &_nall, &nbor_pitch, + &_threads_per_atom); return GX; } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index cb040c630d..dc3467f692 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -145,14 +145,14 @@ class BaseAmoeba { /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double felec, const double off2_mpole, double *charge, - double *boxlo, double *prd, void **tep_ptr); + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **numj, const double 
cpu_time, + bool &success, const double aewald, const double felec, + const double off2_mpole, double *charge, double *boxlo, + double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, @@ -165,8 +165,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_polar, double *charge, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 9a86be8f42..a5e3be5974 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -48,22 +48,20 @@ int HippoT::bytes_per_atom(const int max_nbors) const { template int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, - const double *host_pdamp, const double *host_thole, - const double *host_dirdamp, const int *host_amtype2class, - const double *host_special_hal, - const double *host_special_repel, - const double *host_special_disp, - const double *host_special_mpole, - const double *host_special_polar_wscale, - const double *host_special_polar_piscale, - const double *host_special_polar_pscale, - const double *host_sizpr, const double *host_dmppr, const double *host_elepr, - const double *host_csix, const double *host_adisp, - const double *host_pcore, const double *host_palpha, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const int maxspecial15, - const double cell_size, const double gpu_split, FILE *_screen, - const double polar_dscale, const double polar_uscale) { + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,hippo, @@ -133,9 +131,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<5; i++) { - dview[i].x=host_special_hal[i]; - dview[i].y=host_special_repel[i]; - dview[i].z=host_special_disp[i]; + dview[i].x=host_special_repel[i]; + dview[i].y=host_special_disp[i]; + dview[i].z=(numtyp)0; dview[i].w=(numtyp)0; } ucl_copy(sp_nonpolar,dview,5,false); @@ -211,7 +209,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the 
kernels are ready, precompute() is needed only once // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here + // We only need to cast the necessary from host to device here // if the neighbor lists are rebuilt and other per-atom arrays // (x, type, amtype, amgroup, rpole) are ready on the device. @@ -240,7 +238,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, _c3 = c3; _c4 = c4; _c5 = c5; - const int red_blocks=repulsion(eflag,vflag); + const int red_blocks=repulsion(this->_eflag,this->_vflag); // only copy them back if this is the last kernel // otherwise, commenting out these two lines to leave the answers @@ -316,32 +314,14 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, const double cpu_time, bool &success, const double aewald, const double off2_disp, double *host_q, double *boxlo, double *prd) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. + // We only need to cast necesary data arrays from host to device here + // because the neighbor lists are rebuilt and other per-atom arrays + // (x, type) are ready on the device. 
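The deletions just above are the flag-caching side of this cleanup: precompute() now decides the energy/virial mode once per step and stores it in the _eflag/_vflag members, so every later launch (repulsion, dispersion_real, and the rest) simply reads the cached values instead of repeating the eatom/vatom logic. A minimal stand-alone sketch of that pattern, using simplified stand-in names (BaseSketch, launch_polar_real, main) that are not part of the library:

// Sketch of the flag-caching pattern, with plain types instead of the
// library's template machinery.
#include <cstdio>

class BaseSketch {
 public:
  // Called once at the start of a step: decide the energy/virial mode a
  // single time and cache it for every kernel launched afterwards.
  void precompute(bool eflag_in, bool vflag_in, bool eatom, bool vatom) {
    _eflag = eatom ? 2 : (eflag_in ? 1 : 0);
    _vflag = vatom ? 2 : (vflag_in ? 1 : 0);
  }

  // Later per-term entry points reuse the cached flags instead of
  // recomputing them from the host arguments.
  void compute_polar_real() { launch_polar_real(_eflag, _vflag); }

 private:
  // stand-in for an actual kernel launch
  void launch_polar_real(int eflag, int vflag) {
    std::printf("polar_real launched with eflag=%d vflag=%d\n", eflag, vflag);
  }
  int _eflag = 0, _vflag = 0;
};

int main() {
  BaseSketch b;
  b.precompute(true, false, false, false);  // energy requested, no per-atom tallies
  b.compute_polar_real();
}

Keeping the decision in one place also keeps the LAL_NO_BLOCK_REDUCE override consistent across all of the kernels instead of repeating it in each entry point.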
int** firstneigh = nullptr; firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, @@ -350,11 +330,11 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + success, host_q, boxlo, prd); this->_off2_disp = off2_disp; this->_aewald = aewald; - const int red_blocks=dispersion_real(eflag,vflag); + const int red_blocks=dispersion_real(this->_eflag,this->_vflag); // only copy them back if this is the last kernel // otherwise, commenting out these two lines to leave the answers @@ -427,22 +407,6 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: @@ -474,7 +438,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, this->_off2_mpole = off2_mpole; this->_felec = felec; this->_aewald = aewald; - const int red_blocks=multipole_real(eflag,vflag); + const int red_blocks=multipole_real(this->_eflag,this->_vflag); // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) @@ -486,13 +450,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -558,22 +516,6 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed @@ -596,19 +538,12 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, this->_off2_polar = off2_polar; this->_aewald = aewald; - const int red_blocks=udirect2b(eflag,vflag); + const int red_blocks=udirect2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) this->_fieldp.update_host(this->_max_fieldp_size*8,false); -/* - printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", - this->_fieldp.cols(), _max_fieldp_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); - printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } @@ -673,22 +608,6 @@ 
int** HippoT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer extra data from the host // and build the neighbor lists if needed @@ -711,19 +630,12 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, this->_off2_polar = off2_polar; this->_aewald = aewald; - const int red_blocks=umutual2b(eflag,vflag); + const int red_blocks=umutual2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) this->_fieldp.update_host(this->_max_fieldp_size*8,false); -/* - printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", - this->_fieldp.cols(), _max_fieldp_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); - printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } @@ -786,29 +698,11 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. // We only need to cast uind and uinp from host to device here // if the neighbor lists are rebuilt and other per-atom arrays // (x, type, amtype, amgroup, rpole) are ready on the device. 
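For context on the sp_nonpolar repacking a few hunks up (and the matching .x/.y reads in lal_hippo.cu further below): the scale factors for unscreened, 1-2, 1-3, 1-4, and 1-5 pairs are packed into one vec4-style table so a single load per neighbor yields the factors for that neighbor's special class, which the kernels select with the mask that sbmask15() recovers from the packed neighbor index. A small host-only illustration; the scale values and the bit layout of the mask below are invented for the example (the real encoding is set up during the neighbor build, not here):

#include <array>
#include <cstdio>

struct vec4 { double x, y, z, w; };

int main() {
  // host tables indexed by special class: 0 = not special, 1..4 = 1-2, 1-3, 1-4, 1-5
  // (example values only)
  const double special_repel[5] = {1.0, 0.0, 0.0, 1.0, 1.0};
  const double special_disp[5]  = {1.0, 0.0, 0.0, 0.4, 0.8};

  // pack the two factor tables into one vec4 table, as sp_nonpolar does after
  // this change: .x = repel, .y = disp (the .z/.w slots are left unused)
  std::array<vec4, 5> sp_nonpolar{};
  for (int i = 0; i < 5; ++i)
    sp_nonpolar[i] = {special_repel[i], special_disp[i], 0.0, 0.0};

  // each packed neighbor entry carries its special class in high bits; here the
  // class is simply stored in bits 29..31 for the sake of the example
  auto sbmask15_demo = [](int jextra) { return (jextra >> 29) & 7; };

  int jextra = (3 << 29) | 1234;  // neighbor index 1234, special class 3 (a 1-4 pair)
  const vec4 sp = sp_nonpolar[sbmask15_demo(jextra)];
  std::printf("factor_repel=%g factor_disp=%g\n", sp.x, sp.y);
}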
@@ -833,7 +727,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, this->_off2_polar = off2_polar; this->_felec = felec; this->_aewald = aewald; - const int red_blocks=polar_real(eflag,vflag); + const int red_blocks=polar_real(this->_eflag,this->_vflag); // only copy answers (forces, energies and virial) back from the device // in the last kernel (which is polar_real here) @@ -845,13 +739,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index cb11bd4022..f38a9f4ac0 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -520,7 +520,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp valk = coeff_rep[jtype].z; // elepr[jtype]; const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; - numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)]; + numtyp factor_repel = sp_nonpol.x; // factor_repel = special_repel[sbmask15(j)]; if (factor_repel == (numtyp)0) continue; // intermediates involving moments and separation distance @@ -830,7 +830,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp damp3,damp5; numtyp ddamp; const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; - numtyp factor_disp = sp_nonpol.z; // factor_disp = special_disp[sbmask15(j)]; + numtyp factor_disp = sp_nonpol.y; // factor_disp = special_disp[sbmask15(j)]; if (ai != ak) { ai2 = ai * ai; diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index ceab20d17b..9941460bff 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -41,7 +41,6 @@ class Hippo : public BaseAmoeba { const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, - const double *host_special_hal, const double *host_special_repel, const double *host_special_disp, const double *host_special_polar_wscale, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 15cb53cdb1..4152833320 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -30,7 +30,6 @@ static Hippo HIPPOMF; int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, - const double *host_special_hal, const double *host_special_repel, const double *host_special_disp, const double *host_special_mpole, @@ -71,8 +70,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass if (world_me==0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_hal, - host_special_repel, host_special_disp, + host_amtype2class, host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, host_sizpr, host_dmppr, host_elepr, @@ -97,8 +95,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass if (gpu_rank==i && world_me!=0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, 
host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_hal, - host_special_repel, host_special_disp, + host_amtype2class, host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, host_sizpr, host_dmppr, host_elepr, diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 014b14471e..dcdac836bd 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -53,8 +53,8 @@ enum{GORDON1,GORDON2}; int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int* host_amtype2class, - const double *host_special_hal, const double *host_special_repel, - const double *host_special_disp, const double *host_special_mpole, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -203,7 +203,7 @@ void PairHippoGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, - pdamp, thole, dirdamp, amtype2class, special_hal, + pdamp, thole, dirdamp, amtype2class, special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, sizpr, dmppr, elepr, From 5a6426bf96b2aa8d69d8e4580460b82a48d7573c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 2 Oct 2021 00:56:15 -0500 Subject: [PATCH 074/181] Only transfer data arrays that are needed in each kernel --- lib/gpu/lal_base_amoeba.cpp | 53 ++++++++++--------- lib/gpu/lal_hippo.cpp | 102 ++++++++++-------------------------- 2 files changed, 55 insertions(+), 100 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 7cd410b6b8..c56cb77aa3 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -350,8 +350,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { + bool &success, double *host_q, double *boxlo, double *prd) { acc_timers(); if (eatom) _eflag=2; else if (eflag_in) _eflag=1; @@ -509,7 +508,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, int** firstneigh = nullptr; cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - atom->add_extra_data(); + atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -647,30 +646,34 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, int n = 0; int nstride = 4; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][0]; - pextra[idx+1] = rpole[i][1]; - pextra[idx+2] = rpole[i][2]; - pextra[idx+3] = rpole[i][3]; - } + if (rpole) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][0]; + pextra[idx+1] = rpole[i][1]; + pextra[idx+2] = rpole[i][2]; + pextra[idx+3] = rpole[i][3]; + } - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][4]; - pextra[idx+1] = rpole[i][5]; - pextra[idx+2] = rpole[i][6]; - pextra[idx+3] = rpole[i][8]; - } + n += nstride*_nall; 
+ for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][4]; + pextra[idx+1] = rpole[i][5]; + pextra[idx+2] = rpole[i][6]; + pextra[idx+3] = rpole[i][8]; + } - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][9]; - pextra[idx+1] = rpole[i][12]; - pextra[idx+2] = (numtyp)amtype[i]; - pextra[idx+3] = (numtyp)amgroup[i]; + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][9]; + pextra[idx+1] = rpole[i][12]; + pextra[idx+2] = (numtyp)amtype[i]; + pextra[idx+3] = (numtyp)amgroup[i]; + } + } else { + n += 2*nstride*_nall; } n += nstride*_nall; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index a5e3be5974..5a348c9272 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -314,23 +314,12 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, const double cpu_time, bool &success, const double aewald, const double off2_disp, double *host_q, double *boxlo, double *prd) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // We only need to cast necesary data arrays from host to device here - // because the neighbor lists are rebuilt and other per-atom arrays - // (x, type) are ready on the device. - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr); + this->atom->add_extra_data(); this->_off2_disp = off2_disp; this->_aewald = aewald; @@ -344,7 +333,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, this->hd_balancer.stop_timer(); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -407,25 +396,11 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. 
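The cast_extra_data() hunk above is what makes the nullptr calls in this commit safe: when an array is already resident on the device its section of the host-side staging buffer is left untouched, but the running offset still advances past it, so the sections that do get refilled (uind/uinp, pval) keep their fixed positions. A reduced sketch of that offset bookkeeping, with the 13-component rpole packing collapsed to a single placeholder value per atom:

#include <vector>
#include <cstdio>

// rpole occupies three 4-wide sections when present, uind one section;
// skipped sections advance the offset without being written.
void pack_extra(std::vector<double> &pextra, int nall,
                const double *rpole, const double *uind) {
  const int nstride = 4;
  int n = 0;

  if (rpole) {
    for (int s = 0; s < 3; ++s)
      for (int i = 0; i < nall; ++i)
        pextra[n + s*nstride*nall + i*nstride] = rpole[i];  // placeholder packing
  }
  n += 3*nstride*nall;   // advance past the rpole sections either way

  if (uind)
    for (int i = 0; i < nall; ++i)
      pextra[n + i*nstride] = uind[i];
  n += nstride*nall;
}

int main() {
  const int nall = 2;
  std::vector<double> pextra(4*4*nall, 0.0);
  double uind[nall] = {7.0, 8.0};
  pack_extra(pextra, nall, nullptr, uind);   // rpole already on the device: skipped
  std::printf("uind lands at offset %d -> %g\n", 3*4*nall, pextra[3*4*nall]);
}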
- int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval); + this->atom->add_extra_data(); // ------------------- Resize _tep array ------------------------ @@ -451,7 +426,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -516,17 +491,11 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // all the necessary data arrays are already copied from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); + this->atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -544,7 +513,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, this->_fieldp.update_host(this->_max_fieldp_size*8,false); - return firstneigh; //nbor->host_jlist.begin()-host_start; + return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -608,17 +577,11 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer extra data from the host - // and build the neighbor lists if needed - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -636,7 +599,7 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, this->_fieldp.update_host(this->_max_fieldp_size*8,false); - return firstneigh; //nbor->host_jlist.begin()-host_start; + return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -698,23 +661,12 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double 
felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + //this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); // ------------------- Resize _tep array ------------------------ @@ -740,7 +692,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; } // --------------------------------------------------------------------------- From 0f0f6a51de796caeeece16b9eb77f299a4672866 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 2 Oct 2021 16:02:44 -0500 Subject: [PATCH 075/181] Renamed sp_polar to sp_amoeba, and replaced special_wscale with special_hal for amoeba --- lib/gpu/lal_amoeba.cpp | 41 ++++++++++++++++------------------------- lib/gpu/lal_amoeba.cu | 24 ++++++++++++------------ lib/gpu/lal_amoeba.h | 17 ++++++----------- 3 files changed, 34 insertions(+), 48 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index b92e1bfd55..924a175cfe 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -103,30 +103,21 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, ucl_copy(coeff_amclass,host_write2,false); UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); - sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<5; i++) { - dview[i].x=host_special_polar_wscale[i]; + dview[i].x=host_special_hal[i]; dview[i].y=host_special_polar_piscale[i]; dview[i].z=host_special_polar_pscale[i]; dview[i].w=host_special_mpole[i]; } - ucl_copy(sp_polar,dview,5,false); - - sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); - for (int i=0; i<5; i++) { - dview[i].x=host_special_hal[i]; - dview[i].y=host_special_repel[i]; - dview[i].z=host_special_disp[i]; - dview[i].w=(numtyp)0; - } - ucl_copy(sp_nonpolar,dview,5,false); + ucl_copy(sp_amoeba,dview,5,false); _polar_dscale = polar_dscale; _polar_uscale = polar_uscale; _allocated=true; this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() - + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); + + sp_amoeba.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -138,8 +129,7 @@ void AmoebaT::clear() { coeff_amtype.clear(); coeff_amclass.clear(); - sp_polar.clear(); - sp_nonpolar.clear(); + sp_amoeba.clear(); this->clear_atomic(); } @@ -177,13 +167,14 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { 
&nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, &this->_felec, - &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->k_multipole.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); return GX; @@ -218,7 +209,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { } this->k_udirect2b.set_size(GX,BX); - this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, @@ -258,7 +249,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { } this->k_umutual2b.set_size(GX,BX); - this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, @@ -297,7 +288,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { } this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index befefa8dd0..f29522084d 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -#include +//#include #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -412,7 +412,7 @@ _texture( q_tex,int2); __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_polar, + const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -518,7 +518,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, int jtype = pol3j.z; // amtype[j]; int jgroup = pol3j.w; // amgroup[j]; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; // intermediates involving moments and separation distance @@ -713,7 +713,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict 
sp_polar, + const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -824,12 +824,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, int jgroup = pol3j.w; // amgroup[j]; numtyp factor_dscale, factor_pscale; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; if (igroup == jgroup) { - factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_pscale = sp_pol.y; // sp_amoeba_piscale[sbmask15(jextra)]; factor_dscale = polar_dscale; } else { - factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_pscale = sp_pol.z; // sp_amoeba_pscale[sbmask15(jextra)]; factor_dscale = (numtyp)1.0; } @@ -931,7 +931,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_polar, + const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -1105,7 +1105,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_polar, + const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -1257,13 +1257,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ukzp = pol5j.z; // uinp[j][2]; numtyp factor_dscale, factor_pscale, factor_uscale; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; if (igroup == jgroup) { - factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_pscale = sp_pol.y; // sp_amoeba_piscale[sbmask15(jextra)]; factor_dscale = polar_dscale; factor_uscale = polar_uscale; } else { - factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_pscale = sp_pol.z; // sp_amoeba_pscale[sbmask15(jextra)]; factor_dscale = factor_uscale = (numtyp)1.0; } diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index df72435b81..d12b79719f 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -70,17 +70,12 @@ class Amoeba : public BaseAmoeba { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// Special polar values [0-4]: - /// sp_polar.x = special_polar_wscale - /// sp_polar.y special_polar_pscale, - /// sp_polar.z = special_polar_piscale - /// sp_polar.w = special_mpole - UCL_D_Vec sp_polar; - /// Special nonpolar values [0-4]: - /// sp_nonpolar.x = special_hal - /// sp_nonpolar.y special_repel - /// sp_nonpolar.z = special_disp - UCL_D_Vec sp_nonpolar; + /// Special amoeba values [0-4]: + /// sp_amoeba.x = special_hal + /// sp_amoeba.y = special_polar_pscale, + /// sp_amoeba.z = special_polar_piscale + /// sp_amoeba.w = special_mpole + UCL_D_Vec sp_amoeba; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; From 79fbbd4f33ad0eb42c5f182929c509d162a2d0d9 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 4 Oct 2021 14:40:58 -0500 Subject: [PATCH 076/181] Cleaned up the API of amoeba and hippo to remove unncessary arguments --- 
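The cleanup below follows from the earlier commits in this series: once precompute() is the only entry point that builds neighbor lists and ships positions, tags, and special lists to the device, the per-term wrappers only need the AMOEBA-specific per-atom arrays plus a few scalars, so their argument lists shrink accordingly. A rough, illustrative view of the resulting per-step call order on the host, using simplified stand-in functions rather than the real signatures in lal_amoeba_ext.cpp:

#include <cstdio>

// stand-ins for the slimmed GPU entry points
void gpu_precompute()     { std::printf("neighbor build + x/type/q transfer\n"); }
void gpu_multipole_real() { std::printf("multipole real space\n"); }
void gpu_udirect2b()      { std::printf("direct permanent field\n"); }
void gpu_umutual2b()      { std::printf("mutual induced field (per solver iteration)\n"); }
void gpu_polar_real()     { std::printf("polar real space, forces copied back\n"); }

int main() {
  // one precompute per step supplies neighbors and per-atom data for every
  // later kernel, so the per-term calls no longer re-send positions, tags,
  // or special-neighbor lists
  gpu_precompute();
  gpu_multipole_real();
  gpu_udirect2b();
  for (int iter = 0; iter < 3; ++iter)  // induced-dipole iterations (count is illustrative)
    gpu_umutual2b();
  gpu_polar_real();
}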
lib/gpu/lal_amoeba_ext.cpp | 64 +++++------------- lib/gpu/lal_base_amoeba.cpp | 84 ++++------------------- lib/gpu/lal_base_amoeba.h | 39 +++-------- lib/gpu/lal_hippo.cpp | 98 +++++---------------------- lib/gpu/lal_hippo.h | 63 ++++-------------- lib/gpu/lal_hippo_ext.cpp | 85 +++++++----------------- src/GPU/pair_amoeba_gpu.cpp | 90 +++++-------------------- src/GPU/pair_hippo_gpu.cpp | 129 ++++++++---------------------------- 8 files changed, 135 insertions(+), 517 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index b73f6c4ca6..18e1cf22f8 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -134,58 +134,28 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); } -int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr) { - return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, off2, fieldp_ptr); } -int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr) { - return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, off2, fieldp_ptr); } -int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double 
**host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c56cb77aa3..5b396a641e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -487,35 +487,15 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // of the permanent field // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, +void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - - int** firstneigh = nullptr; + // all the necessary data arrays are already copied from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - - // ------------------- Resize _fieldp array ------------------------ - - if (inum_full>_max_fieldp_size) { - _max_fieldp_size=static_cast(static_cast(inum_full)*1.10); - _fieldp.resize(_max_fieldp_size*8); - } + *fieldp_ptr=_fieldp.host.begin(); _off2_polar = off2_polar; @@ -525,8 +505,6 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); - - return firstneigh; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -534,35 +512,15 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, // of the induced field // --------------------------------------------------------------------------- 
template -int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, +void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer extra data from the host - // and build the neighbor lists if needed - - int** firstneigh = nullptr; + // all the necessary data arrays are already copied from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - // ------------------- Resize _fieldp array ------------------------ - - if (inum_full>_max_fieldp_size) { - _max_fieldp_size=static_cast(static_cast(inum_full)*1.10); - _fieldp.resize(_max_fieldp_size*8); - } *fieldp_ptr=_fieldp.host.begin(); _off2_polar = off2_polar; @@ -572,41 +530,25 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); - - return firstneigh; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_polar, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { +void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { int** firstneigh = nullptr; cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - // ------------------- Resize _tep array ------------------------ - - if (inum_full>_max_tep_size) { - _max_tep_size=static_cast(static_cast(inum_full)*1.10); - _tep.resize(_max_tep_size*4); - } *tep_ptr=_tep.host.begin(); _off2_polar = off2_polar; @@ -624,8 +566,6 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); - - return firstneigh; // nbor->host_jlist.begin()-host_start; } template diff --git a/lib/gpu/lal_base_amoeba.h 
b/lib/gpu/lal_base_amoeba.h index dc3467f692..7f9777061c 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -155,45 +155,22 @@ class BaseAmoeba { double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring - virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_polar, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring - virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_polar, double *charge, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, void **fieldp_ptr); /// Compute polar real-space with device neighboring - virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double *host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, + const bool eatom, const bool vatom, const double aewald, const double felec, const double off2_polar, - double *charge, double *boxlo, double *prd, void **tep_ptr); + void **tep_ptr); /// Compute polar real-space with host neighboring (not active for now) void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 5a348c9272..f62c46aaec 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -301,19 +301,9 @@ int HippoT::repulsion(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- template -int** HippoT::compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int 
*host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_disp, - double *host_q, double *boxlo, double *prd) { +int** HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp) { // cast necessary data arrays from host to device @@ -475,21 +465,9 @@ int HippoT::multipole_real(const int eflag, const int vflag) { // of the permanent field // --------------------------------------------------------------------------- template -int** HippoT::compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double* host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, +void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double* host_pval, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { // all the necessary data arrays are already copied from host to device @@ -497,12 +475,6 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); this->atom->add_extra_data(); - // ------------------- Resize _fieldp array ------------------------ - - if (inum_full>this->_max_fieldp_size) { - this->_max_fieldp_size=static_cast(static_cast(inum_full)*1.10); - this->_fieldp.resize(this->_max_fieldp_size*8); - } *fieldp_ptr=this->_fieldp.host.begin(); this->_off2_polar = off2_polar; @@ -512,8 +484,6 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) this->_fieldp.update_host(this->_max_fieldp_size*8,false); - - return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -562,33 +532,16 @@ int HippoT::udirect2b(const int eflag, const int vflag) { // of the induced field // --------------------------------------------------------------------------- template -int** HippoT::compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, - void** fieldp_ptr) { +void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double 
**host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr) { // cast necessary data arrays from host to device this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); this->atom->add_extra_data(); - // ------------------- Resize _fieldp array ------------------------ - - if (inum_full>this->_max_fieldp_size) { - this->_max_fieldp_size=static_cast(static_cast(inum_full)*1.10); - this->_fieldp.resize(this->_max_fieldp_size*8); - } *fieldp_ptr=this->_fieldp.host.begin(); this->_off2_polar = off2_polar; @@ -598,8 +551,6 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) this->_fieldp.update_host(this->_max_fieldp_size*8,false); - - return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -646,34 +597,17 @@ int HippoT::umutual2b(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** HippoT::compute_polar_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_polar, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { - +void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { // cast necessary data arrays from host to device - //this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); this->atom->add_extra_data(); - // ------------------- Resize _tep array ------------------------ - - if (inum_full>this->_max_tep_size) { - this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); - this->_tep.resize(this->_max_tep_size*4); - } *tep_ptr=this->_tep.host.begin(); this->_off2_polar = off2_polar; @@ -691,8 +625,6 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); - - return nullptr; } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 9941460bff..492712eb85 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -72,16 +72,9 @@ class Hippo : public BaseAmoeba { double c3, double c4, double c5,void** tep_ptr); /// Compute dispersion real-space with device neighboring - int** compute_dispersion_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint 
**special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_disp, double *charge, - double *boxlo, double *prd); + int** compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -96,51 +89,23 @@ class Hippo : public BaseAmoeba { double *boxlo, double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring - virtual int** compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double* host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, - void** fieldp_ptr); + const double aewald, const double off2_polar, void** fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring - virtual int** compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, - void** fieldp_ptr); + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr); /// Compute polar real-space with device neighboring - virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double *host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, const double aewald, const double felec, const double off2_polar, - double *charge, double *boxlo, double *prd, void **tep_ptr); + void **tep_ptr); /// Clear all host and device data /** \note This is called 
at the beginning of the init() routine **/ diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 4152833320..9d3d845ad0 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -140,21 +140,11 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, cut2, c0, c1, c2, c3, c4, c5, tep_ptr); } -int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd) { - return HIPPOMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, - tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd); +void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2) { + HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole, + aewald, off2); } int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, @@ -174,58 +164,29 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); } -int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr) { - return HIPPOMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); } -int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr) { - 
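// Editor's note -- not part of the patch: a minimal sketch of how a caller reads
// the pinned buffer handed back through the void** argument of the trimmed
// compute_udirect2b()/compute_umutual2b() interfaces above.  The library packs
// field and fieldp back to back, 4 doubles per atom each (only the first three
// components are read back); _tep is packed the same way with 4 doubles per atom.
// The names nlocal, fieldp_pinned, field and fieldp are the caller-side variables
// used later in pair_amoeba_gpu.cpp / pair_hippo_gpu.cpp.
double *buf = static_cast<double *>(fieldp_pinned);
for (int i = 0; i < nlocal; i++) {          // first block: field
  field[i][0] += buf[4*i];
  field[i][1] += buf[4*i+1];
  field[i][2] += buf[4*i+2];
}
buf += 4*nlocal;                            // second block: fieldp
for (int i = 0; i < nlocal; i++) {
  fieldp[i][0] += buf[4*i];
  fieldp[i][1] += buf[4*i+1];
  fieldp[i][2] += buf[4*i+2];
}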
return HIPPOMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); } -int** hippo_gpu_compute_polar_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, - double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return HIPPOMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } double hippo_gpu_bytes() { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 65a4af7d64..ea7c40793c 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -74,35 +74,19 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tq_ptr); -int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr); -int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, 
double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr); -int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2, - double *host_q, double *boxlo, double *prd, void **tq_ptr); + const double aewald, const double felec, const double off2, + void **tq_ptr); double amoeba_gpu_bytes(); @@ -345,14 +329,7 @@ void PairAmoebaGPU::induce() } } } -/* - printf("GPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < 10; i++) { - printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", - i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); - } -*/ + // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? @@ -731,17 +708,8 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + amoeba_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs @@ -933,10 +901,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; + int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -956,17 +921,8 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + 
amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, + off2, &fieldp_pinned); // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -1005,10 +961,7 @@ void PairAmoebaGPU::polar_real() int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; + int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -1032,18 +985,9 @@ void PairAmoebaGPU::polar_real() double felec = 0.5 * electric / am_dielectric; - firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + amoeba_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); // reference to the tep array from GPU lib diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index dcdac836bd..0d77c67e10 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -80,16 +80,8 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, double cut2, double c0, double c1, double c2, double c3, double c4, double c5, void **tep_ptr); -int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd); +void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, + const double aewald, const double off2); int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, @@ -100,35 +92,19 @@ int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tq_ptr); -int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr); + double *host_pval, const double aewald, const double off2, void **fieldp_ptr); -int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, 
const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr); -int ** hippo_gpu_compute_polar_real(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2, - double *host_q, double *boxlo, double *prd, void **tq_ptr); + const double aewald, const double felec, const double off2, + void **tq_ptr); double hippo_gpu_bytes(); @@ -301,7 +277,6 @@ void PairHippoGPU::dispersion_real() int nall = atom->nlocal + atom->nghost; int inum, host_start; - bool success = true; int *ilist, *numneigh, **firstneigh; double sublo[3],subhi[3]; @@ -322,18 +297,7 @@ void PairHippoGPU::dispersion_real() if (use_dewald) choose(DISP_LONG); else choose(DISP); - firstneigh = hippo_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + hippo_gpu_compute_dispersion_real(amtype, amgroup, rpole, aewald, off2); } /* ---------------------------------------------------------------------- @@ -377,15 +341,15 @@ void PairHippoGPU::multipole_real() double felec = electric / am_dielectric; - firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, pval, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, pval, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -854,9 +818,6 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) int nall = atom->nlocal + atom->nghost; int inum, host_start; - bool success = true; - int *ilist, *numneigh, **firstneigh; - 
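// Editor's note (assumption summarizing the refactor around these declarations
// and call sites; nothing below is new patch code): positions, tags, special-bond
// lists and the device neighbor list are now transferred and built once per step
// by the precompute/multipole_real path, so the later real-space terms only pass
// the per-atom AMOEBA/HIPPO data plus the Ewald parameters.  The former per-call
// guard of the form
//
//   if (!success) error->one(FLERR,"Insufficient memory on accelerator");
//
// is dropped along with the bool because these trimmed calls no longer
// (re)allocate device buffers themselves.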
double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -875,17 +836,8 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - firstneigh = hippo_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, pval, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + hippo_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs @@ -1078,10 +1030,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; + int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -1101,17 +1050,9 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - firstneigh = hippo_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, pval, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); + // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -1150,10 +1091,7 @@ void PairHippoGPU::polar_real() int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; + int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -1177,18 +1115,9 @@ void PairHippoGPU::polar_real() double felec = 0.5 * electric / am_dielectric; - firstneigh = hippo_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, pval, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + hippo_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, pval, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); // reference to the tep array from GPU lib From f4900d131ac828fca7b811fd98f85e276e6a0f70 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 1 Jul 2022 16:26:25 -0500 Subject: [PATCH 077/181] Working on the multipole term on the gpu side, incorrect virials --- lib/gpu/lal_amoeba.cu | 24 +++++++-------- src/GPU/pair_amoeba_gpu.cpp | 58 +++++++------------------------------ 2 files changed, 23 insertions(+), 59 deletions(-) diff --git 
a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index f29522084d..f91e973c9b 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -668,9 +668,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // increment force-based gradient and torque on first site - f.x += frcx; - f.y += frcy; - f.z += frcz; + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; tq.x += ttmix; tq.y += ttmiy; tq.z += ttmiz; @@ -683,12 +683,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); numtyp vzz = -zr * frcz; - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vxy; - virial[4] += vxz; - virial[5] += vyz; + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; } } // nbor @@ -1597,9 +1597,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, frcy = frcy + depy; frcz = frcz + depz; - f.x -= frcx; - f.y -= frcy; - f.z -= frcz; + f.x += frcx; + f.y += frcy; + f.z += frcz; if (EVFLAG && vflag) { numtyp vxx = xr * frcx; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index dcbbc01185..fb9e8ef7e3 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -104,9 +104,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // true for AMOEBA when ready gpu_repulsion_ready = false; // always false for AMOEBA gpu_dispersion_real_ready = false; // always false for AMOEBA - gpu_multipole_real_ready = true; - gpu_udirect2b_ready = true; - gpu_umutual2b_ready = true; + gpu_multipole_real_ready = false; + gpu_udirect2b_ready = false; + gpu_umutual2b_ready = false; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -262,26 +262,16 @@ void PairAmoebaGPU::induce() double sum,sump,term; double reduce[4],allreduce[4]; - double *poli; - double **conj,**conjp; - double **vec,**vecp; - double **udir,**usum,**usump; - int debug = 1; // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - if (use_ewald) { - choose(POLAR_LONG); - int nmine = p_kspace->nfft_owned; - memory->create(qfac,nmine,"ameoba/induce:qfac"); - } else choose(POLAR); + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); // owned atoms - double **x = atom->x; - double **f = atom->f; int nlocal = atom->nlocal; // zero out the induced dipoles at each site @@ -293,19 +283,6 @@ void PairAmoebaGPU::induce() } } - // allocation of arrays - // NOTE: not all are used by all methods - // NOTE: could be re-allocated dynamically - - memory->create(poli,nlocal,"ameoba/induce:poli"); - memory->create(conj,nlocal,3,"ameoba/induce:conj"); - memory->create(conjp,nlocal,3,"ameoba/induce:conjp"); - memory->create(vec,nlocal,3,"ameoba/induce:vec"); - memory->create(vecp,nlocal,3,"ameoba/induce:vecp"); - memory->create(udir,nlocal,3,"ameoba/induce:udir"); - memory->create(usum,nlocal,3,"ameoba/induce:usum"); - memory->create(usump,nlocal,3,"ameoba/induce:usump"); - // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); @@ -572,8 +549,6 @@ void PairAmoebaGPU::induce() } } - // NOTE: comp of b,bp and allreduce only needed if pcgprec ? 
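// Editor's sketch (assumption about how the gpu_*_ready flags set above are meant
// to be used; the override below is hypothetical, not patch code): each flag lets
// one real-space term fall back to the reference CPU implementation in the
// PairAmoeba base class while its kernel is still being debugged, e.g.
//
//   void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
//   {
//     if (!gpu_udirect2b_ready) {              // term not ready: use the CPU path
//       PairAmoeba::udirect2b(field, fieldp);
//       return;
//     }
//     ...                                      // otherwise call the GPU library
//   }
//
// which is why individual flags are toggled between commits in this series
// without affecting the correctness of the overall polarization solve.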
- reduce[0] = b; reduce[1] = bp; MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); @@ -633,17 +608,6 @@ void PairAmoebaGPU::induce() error->warning(FLERR,"AMOEBA induced dipoles did not converge"); } - // deallocation of arrays - - memory->destroy(poli); - memory->destroy(conj); - memory->destroy(conjp); - memory->destroy(vec); - memory->destroy(vecp); - memory->destroy(udir); - memory->destroy(usum); - memory->destroy(usump); - // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -1047,12 +1011,12 @@ void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - virial_comp[0] += vxx; - virial_comp[1] += vyy; - virial_comp[2] += vzz; - virial_comp[3] += vxy; - virial_comp[4] += vxz; - virial_comp[5] += vyz; + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; } } From 5dab809522927bc98a727f92a479c9c935d892c7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 4 Jul 2022 01:38:22 -0500 Subject: [PATCH 078/181] Flipped force sign in polar_real, made sure that multipole_real is true for precompute() to be invoked, ubdirect2b() is segfault and needs work --- lib/gpu/lal_amoeba.cu | 12 ++++++------ src/GPU/pair_amoeba_gpu.cpp | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index f91e973c9b..a40f8314a5 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1609,12 +1609,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); numtyp vzz = zr * frcz; - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vxy; - virial[4] += vxz; - virial[5] += vyz; + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; } } // nbor diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index fb9e8ef7e3..56c621c4dc 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -104,9 +104,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // true for AMOEBA when ready gpu_repulsion_ready = false; // always false for AMOEBA gpu_dispersion_real_ready = false; // always false for AMOEBA - gpu_multipole_real_ready = false; - gpu_udirect2b_ready = false; - gpu_umutual2b_ready = false; + gpu_multipole_real_ready = true; // need to be true for precompute() + gpu_udirect2b_ready = false; // NEED work + gpu_umutual2b_ready = true; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); From ee5afdc1468a79614b1743f62ab09baea9887814 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 4 Jul 2022 23:24:31 -0500 Subject: [PATCH 079/181] Updated all the gpu ready terms --- src/GPU/pair_amoeba_gpu.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 56c621c4dc..3d601fef88 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -105,9 +105,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_repulsion_ready = false; // always false for AMOEBA gpu_dispersion_real_ready = false; // always false for AMOEBA gpu_multipole_real_ready = true; // need to be 
true for precompute() - gpu_udirect2b_ready = false; // NEED work + gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; - gpu_polar_real_ready = true; + gpu_polar_real_ready = true; // need to be true for copying data from device back to host GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } From 83288666117b11b78d5094d74e3c2866266b1f8e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 5 Jul 2022 11:02:31 -0500 Subject: [PATCH 080/181] Added checks for the gpu variant of pair amoeba/hippo in improper/amoeba and fix amoeba/bitorsion --- src/AMOEBA/fix_amoeba_bitorsion.cpp | 2 ++ src/AMOEBA/improper_amoeba.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/AMOEBA/fix_amoeba_bitorsion.cpp b/src/AMOEBA/fix_amoeba_bitorsion.cpp index 85a87b0452..28e055a0fe 100644 --- a/src/AMOEBA/fix_amoeba_bitorsion.cpp +++ b/src/AMOEBA/fix_amoeba_bitorsion.cpp @@ -195,7 +195,9 @@ void FixAmoebaBiTorsion::init() pair = nullptr; pair = force->pair_match("amoeba",1,0); + if (!pair) pair = force->pair_match("amoeba/gpu",1,0); if (!pair) pair = force->pair_match("hippo",1,0); + if (!pair) pair = force->pair_match("hippo/gpu",1,0); if (!pair) error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo"); diff --git a/src/AMOEBA/improper_amoeba.cpp b/src/AMOEBA/improper_amoeba.cpp index 2b39214642..3ff4978f0f 100644 --- a/src/AMOEBA/improper_amoeba.cpp +++ b/src/AMOEBA/improper_amoeba.cpp @@ -286,7 +286,9 @@ void ImproperAmoeba::init_style() Pair *pair = NULL; pair = force->pair_match("amoeba",1,0); + if (!pair) pair = force->pair_match("amoeba/gpu",1,0); if (!pair) pair = force->pair_match("hippo",1,0); + if (!pair) pair = force->pair_match("hippo/gpu",1,0); if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo"); int tmp; From 675c2d38a3017217b662e2c516d39ea5e64ac13a Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 5 Jul 2022 14:37:26 -0500 Subject: [PATCH 081/181] Flipped sign of forces and virial terms in the hippo kernels --- lib/gpu/lal_amoeba.cu | 3 +- lib/gpu/lal_hippo.cu | 73 ++++++++++++++++++++++--------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index a40f8314a5..3b50feb6ed 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1642,8 +1642,7 @@ __kernel void k_amoeba_special15(__global int * dev_nbor, const __global tagint *restrict tag, const __global int *restrict nspecial15, const __global tagint *restrict special15, - const int inum, const int nall, - const int nbor_pitch, + const int inum, const int nall, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset, n_stride, i; atom_info(t_per_atom,ii,tid,offset); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index f38a9f4ac0..b47e2d50e3 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -687,9 +687,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, // increment force-based gradient and torque on atom I - f.x += frcx; - f.y += frcy; - f.z += frcz; + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; tq.x += ttmix; tq.y += ttmiy; tq.z += ttmiz; @@ -703,12 +703,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); numtyp vzz = -zr * frcz; - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vxy; - virial[4] += vxz; - virial[5] += vyz; + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; } } // nbor @@ 
-877,9 +877,9 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp dedx = de * xr; numtyp dedy = de * yr; numtyp dedz = de * zr; - f.x += dedx; - f.y += dedy; - f.z += dedz; + f.x -= dedx; + f.y -= dedy; + f.z -= dedz; // increment the internal virial tensor components @@ -890,12 +890,12 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp vzy = zr * dedy; numtyp vzz = zr * dedz; - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vyx; - virial[4] += vzx; - virial[5] += vzy; + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vyx; + virial[4] -= vzx; + virial[5] -= vzy; } // nbor } // ii> SBBITS & 3; int j = sj & NEIGHMASK; tagint jtag = tag[j]; + if (!which) { int offset=ii; for (int k=0; k Date: Wed, 6 Jul 2022 11:17:08 -0500 Subject: [PATCH 082/181] Removed temporary arrays in hippo/gpu induce, flipped sign of the viriral terms in torque2force in hippo/gpu --- src/GPU/pair_amoeba_gpu.cpp | 8 +++--- src/GPU/pair_hippo_gpu.cpp | 51 +++++++------------------------------ 2 files changed, 12 insertions(+), 47 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3d601fef88..fd9d99e56c 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -647,9 +647,6 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) int nall = atom->nlocal + atom->nghost; int inum, host_start; - bool success = true; - int *ilist, *numneigh, **firstneigh; - double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -674,6 +671,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). 
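// Editor's note on the packed neighbor indices used by k_hippo_special15 /
// k_amoeba_special15 above (explanatory restatement of existing conventions,
// not new patch code): the GPU neighbor build stores the special-bond class of
// each neighbor in the top bits of its index, so kernels unpack both pieces as
//
//   int which = sj >> SBBITS & 3;   // 0 = ordinary, 1/2/3 = 1-2 / 1-3 / 1-4 neighbor
//   int j     = sj & NEIGHMASK;     // low bits: the actual neighbor index
//
// The special15 kernel appears to walk the nspecial15/special15 lists and tag the
// matching entries so that the polar kernels can apply the separate 1-5 scale
// factors without a host-side pass over the neighbor list.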
// udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib @@ -881,8 +879,8 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, - off2, &fieldp_pinned); + amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 55712b3250..535be0c160 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -388,11 +388,6 @@ void PairHippoGPU::induce() double sum,sump,term; double reduce[4],allreduce[4]; - double *poli; - double **conj,**conjp; - double **vec,**vecp; - double **udir,**usum,**usump; - int debug = 1; // set cutoffs, taper coeffs, and PME params @@ -419,24 +414,11 @@ void PairHippoGPU::induce() } } - // allocation of arrays - // NOTE: not all are used by all methods - // NOTE: could be re-allocated dynamically - - memory->create(poli,nlocal,"ameoba/induce:poli"); - memory->create(conj,nlocal,3,"ameoba/induce:conj"); - memory->create(conjp,nlocal,3,"ameoba/induce:conjp"); - memory->create(vec,nlocal,3,"ameoba/induce:vec"); - memory->create(vecp,nlocal,3,"ameoba/induce:vecp"); - memory->create(udir,nlocal,3,"ameoba/induce:udir"); - memory->create(usum,nlocal,3,"ameoba/induce:usum"); - memory->create(usump,nlocal,3,"ameoba/induce:usump"); - // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); - // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only + // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only if (!gpu_udirect2b_ready) { crstyle = FIELD; @@ -705,8 +687,6 @@ void PairHippoGPU::induce() } } - // NOTE: comp of b,bp and allreduce only needed if pcgprec ? 
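// Editor's sketch (assumption; the exact communication call depends on the LAMMPS
// version, so treat this as illustrative only): when dfield0c()/udirect2b() runs
// on the CPU it also accumulates field contributions on ghost atoms, which have to
// be summed back onto the owning ranks before the induced-dipole iteration
// continues -- hence the guard begun above:
//
//   if (!gpu_udirect2b_ready) {
//     crstyle = FIELD;
//     comm->reverse_comm(this);   // hypothetical: fold ghost contributions back
//   }
//
// With the GPU path enabled the pinned buffer is only read for owned atoms, so
// this reverse communication can be skipped.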
- reduce[0] = b; reduce[1] = bp; MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); @@ -763,20 +743,9 @@ void PairHippoGPU::induce() if (iter >= maxiter || eps > epsold) if (comm->me == 0) - error->warning(FLERR,"hippo induced dipoles did not converge"); + error->warning(FLERR,"HIPPO induced dipoles did not converge"); } - // deallocation of arrays - - memory->destroy(poli); - memory->destroy(conj); - memory->destroy(conjp); - memory->destroy(vec); - memory->destroy(vecp); - memory->destroy(udir); - memory->destroy(usum); - memory->destroy(usump); - // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -835,7 +804,6 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) else choose(POLAR); double *pval = atom->dvector[index_pval]; - hippo_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, pval, aewald, off2, &fieldp_pinned); @@ -1051,10 +1019,9 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) else choose(POLAR); double *pval = atom->dvector[index_pval]; - hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, aewald, off2, &fieldp_pinned); - + // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -1183,12 +1150,12 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - virial_comp[0] += vxx; - virial_comp[1] += vyy; - virial_comp[2] += vzz; - virial_comp[3] += vxy; - virial_comp[4] += vxz; - virial_comp[5] += vyz; + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; } } From 0c44bd10862016f21870bc585ac148db3996cd38 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 8 Jul 2022 14:45:31 -0500 Subject: [PATCH 083/181] Rearranged the order of real-space and kspace part of ufield0c(), delayed device-host transfer from umutual2b() to overlap with kspace part --- lib/gpu/lal_amoeba_ext.cpp | 4 +++ lib/gpu/lal_base_amoeba.cpp | 4 +-- lib/gpu/lal_base_amoeba.h | 7 ++++ lib/gpu/lal_hippo.cpp | 4 +-- lib/gpu/lal_hippo_ext.cpp | 4 +++ src/GPU/pair_amoeba_gpu.cpp | 71 ++++++++++++++++++++++++++++++++++++- src/GPU/pair_amoeba_gpu.h | 1 + src/GPU/pair_hippo_gpu.cpp | 71 ++++++++++++++++++++++++++++++++++++- src/GPU/pair_hippo_gpu.h | 1 + 9 files changed, 161 insertions(+), 6 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 18e1cf22f8..63ed683833 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -148,6 +148,10 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double ** aewald, off2, fieldp_ptr); } +void amoeba_gpu_update_fieldp(void **fieldp_ptr) { + AMOEBAMF.update_fieldp(fieldp_ptr); +} + void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const bool eflag_in, const bool vflag_in, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5b396a641e..781945b77b 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -528,8 +528,8 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double const int red_blocks=umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - - 
_fieldp.update_host(_max_fieldp_size*8,false); + // NOTE: move this step to update_fieldp() to delay device-host transfer + //_fieldp.update_host(_max_fieldp_size*8,false); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 7f9777061c..f439e2945f 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -183,6 +183,13 @@ class BaseAmoeba { const double off2_polar, double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); + // copy field and fieldp from device to host after umutual2b + virtual void update_fieldp(void **fieldp_ptr) { + *fieldp_ptr=_fieldp.host.begin(); + // _fieldp store both arrays, one after another + _fieldp.update_host(_max_fieldp_size*8,false); + } + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index f62c46aaec..3065bfefd4 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -549,8 +549,8 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos const int red_blocks=umutual2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - - this->_fieldp.update_host(this->_max_fieldp_size*8,false); + // NOTE: move this step to update_fieldp() to delay device-host transfer + //this->_fieldp.update_host(this->_max_fieldp_size*8,false); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 9d3d845ad0..e7deaddbf3 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -179,6 +179,10 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **h aewald, off2, fieldp_ptr); } +void hippo_gpu_update_fieldp(void **fieldp_ptr) { + HIPPOMF.update_fieldp(fieldp_ptr); +} + void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, const bool eflag_in, const bool vflag_in, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index fd9d99e56c..1376a6bd12 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -82,6 +82,8 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const double aewald, const double off2, void **fieldp_ptr); +void amoeba_gpu_update_fieldp(void **fieldp_ptr); + void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -844,6 +846,72 @@ void PairAmoebaGPU::udirect2b_cpu() } } +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::ufield0c(double **field, double **fieldp) +{ + int i,j; + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + for (i = 0; i < nall; i++) { + for (j = 0; j < 3; j++) { + field[i][j] = 0.0; + fieldp[i][j] = 0.0; + } + } + + // get the 
real space portion of the mutual field first + + if (polar_rspace_flag) umutual2b(field,fieldp); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + field[i][j] += term*uind[i][j]; + fieldp[i][j] += term*uinp[i][j]; + } + } + + // accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + amoeba_gpu_update_fieldp(&fieldp_pinned); + + int inum = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced @@ -881,7 +949,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, off2, &fieldp_pinned); - +/* // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -903,6 +971,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } +*/ } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e0210faa68..e419ccd1a1 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -39,6 +39,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); + virtual void ufield0c(double **, double **); virtual void polar_real(); private: diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 535be0c160..41c1355fbb 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -100,6 +100,8 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2, void **fieldp_ptr); +void hippo_gpu_update_fieldp(void **fieldp_ptr); + void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -983,6 +985,72 @@ void PairHippoGPU::udirect2b_cpu() } } +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairHippoGPU::ufield0c(double **field, double **fieldp) +{ + int i,j; + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = 
atom->nlocal; + int nall = nlocal + atom->nghost; + + for (i = 0; i < nall; i++) { + for (j = 0; j < 3; j++) { + field[i][j] = 0.0; + fieldp[i][j] = 0.0; + } + } + + // get the real space portion of the mutual field first + + if (polar_rspace_flag) umutual2b(field,fieldp); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + field[i][j] += term*uind[i][j]; + fieldp[i][j] += term*uinp[i][j]; + } + } + + // accumulate the field and fieldp values from real-space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + hippo_gpu_update_fieldp(&fieldp_pinned); + + int inum = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced @@ -1021,7 +1089,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) double *pval = atom->dvector[index_pval]; hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, aewald, off2, &fieldp_pinned); - +/* // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -1043,6 +1111,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } +*/ } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index c7a4e75ebe..1ed1c3299d 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -40,6 +40,7 @@ class PairHippoGPU : public PairAmoeba { virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); + virtual void ufield0c(double **, double **); virtual void polar_real(); private: From 66ee2bf98973519cb8711d1732879905d8180a2d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 14 Jul 2022 11:01:30 -0500 Subject: [PATCH 084/181] Cleaned up --- lib/gpu/lal_base_amoeba.cpp | 5 ++--- lib/gpu/lal_hippo.cpp | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 781945b77b..6f65c8c934 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -521,15 +521,14 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - *fieldp_ptr=_fieldp.host.begin(); - _off2_polar = off2_polar; _aewald = aewald; const int red_blocks=umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer - 
//_fieldp.update_host(_max_fieldp_size*8,false); + // *fieldp_ptr=_fieldp.host.begin(); + // _fieldp.update_host(_max_fieldp_size*8,false); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 3065bfefd4..79a8772c3e 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -542,15 +542,14 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); this->atom->add_extra_data(); - *fieldp_ptr=this->_fieldp.host.begin(); - this->_off2_polar = off2_polar; this->_aewald = aewald; const int red_blocks=umutual2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer - //this->_fieldp.update_host(this->_max_fieldp_size*8,false); + // *fieldp_ptr=this->_fieldp.host.begin(); + // this->_fieldp.update_host(this->_max_fieldp_size*8,false); } // --------------------------------------------------------------------------- From 288fd5add4de3f2aa4c7c8d98990a0fa92af440c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 19 Jul 2022 15:18:17 -0500 Subject: [PATCH 085/181] Updated the python scripts under tools/tinker to the latest version in develop --- tools/tinker/data.py | 146 ++++++++++++++++++------------------- tools/tinker/tinker2lmp.py | 113 ++++++++++++++-------------- 2 files changed, 130 insertions(+), 129 deletions(-) diff --git a/tools/tinker/data.py b/tools/tinker/data.py index 40d6582814..b75536da93 100644 --- a/tools/tinker/data.py +++ b/tools/tinker/data.py @@ -3,16 +3,17 @@ # # Copyright (2005) Sandia Corporation. Under the terms of Contract # DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains -# certain rights in this software. This software is distributed under +# certain rights in this software. This software is distributed under # the GNU General Public License. 
# data tool +from __future__ import print_function oneline = "Read, write, manipulate LAMMPS data files" docstr = """ d = data("data.poly") read a LAMMPS data file, can be gzipped -d = data() create an empty data file +d = data() create an empty data file d.map(1,"id",3,"x") assign names to atom columns (1-N) @@ -26,17 +27,17 @@ d.reorder("Atoms",1,3,2,4,5) reorder columns (1-N) in a data file section 1,3,2,4,5 = new order of previous columns, can delete columns this way -d.title = "My LAMMPS data file" set title of the data file +d.title = "My LAMMPS data file" set title of the data file d.headers["atoms"] = 1500 set a header value d.sections["Bonds"] = lines set a section to list of lines (with newlines) -d.delete("bonds") delete a keyword or section of data file +d.delete("bonds") delete a keyword or section of data file d.delete("Bonds") -d.replace("Atoms",5,vec) replace Nth column of section with vector -d.newxyz(dmp,1000) replace xyz in Atoms with xyz of snapshot N +d.replace("Atoms",5,vec) replace Nth column of section with vector +d.newxyz(dmp,1000) replace xyz in Atoms with xyz of snapshot N newxyz assumes id,x,y,z are defined in both data and dump files also replaces ix,iy,iz if they are defined - + index,time,flag = d.iterator(0/1) loop over single data file snapshot time,box,atoms,bonds,tris,lines = d.viz(index) return list of viz objects @@ -53,7 +54,7 @@ time,box,atoms,bonds,tris,lines = d.viz(index) return list of viz objects NULL if bonds do not exist tris = NULL lines = NULL - + d.write("data.new") write a LAMMPS data file """ @@ -65,7 +66,7 @@ d.write("data.new") write a LAMMPS data file # Variables # title = 1st line of data file -# names = dictionary with atom attributes as keys, col #s as values +# names = dictionary with atom attributes as keys, col #s as values # headers = dictionary with header name as key, value or tuple as values # sections = dictionary with section name as key, array of lines as values # nselect = 1 = # of snapshots @@ -79,13 +80,13 @@ except: PIZZA_GUNZIP = "gunzip" # Class definition -class data: +class data(object): # -------------------------------------------------------------------- def __init__(self,*list): self.nselect = 1 - + if len(list) == 0: self.title = "LAMMPS data file" self.names = {} @@ -99,7 +100,7 @@ class data: self.title = f.readline() self.names = {} - + headers = {} while 1: line = f.readline() @@ -109,16 +110,16 @@ class data: found = 0 for keyword in hkeywords: if line.find(keyword) >= 0: - found = 1 - words = line.split() - if keyword == "xlo xhi" or keyword == "ylo yhi" or \ - keyword == "zlo zhi": - headers[keyword] = (float(words[0]),float(words[1])) - elif keyword == "xy xz yz": - headers[keyword] = \ + found = 1 + words = line.split() + if keyword == "xlo xhi" or keyword == "ylo yhi" or \ + keyword == "zlo zhi": + headers[keyword] = (float(words[0]),float(words[1])) + elif keyword == "xy xz yz": + headers[keyword] = \ (float(words[0]),float(words[1]),float(words[2])) else: - headers[keyword] = int(words[0]) + headers[keyword] = int(words[0]) if not found: break @@ -128,22 +129,21 @@ class data: for pair in skeywords: keyword,length = pair[0],pair[1] if keyword == line: - found = 1 - if not headers.has_key(length): - raise StandardError, \ - "data section %s has no matching header value" % line - f.readline() + found = 1 + if length not in headers: + raise (Exception, "data section %s has no matching header value" % line) + f.readline() list = [] - for i in xrange(headers[length]): list.append(f.readline()) + for i 
in range(headers[length]): list.append(f.readline()) sections[keyword] = list if not found: - raise StandardError,"invalid section %s in data file" % line + raise (Exception,"invalid section %s in data file" % line) f.readline() line = f.readline() if not line: break line = line.strip() - + f.close() self.headers = headers self.sections = sections @@ -153,7 +153,7 @@ class data: def map(self,*pairs): if len(pairs) % 2 != 0: - raise StandardError, "data map() requires pairs of mappings" + raise Exception("data map() requires pairs of mappings") for i in range(0,len(pairs),2): j = i + 1 self.names[pairs[j]] = pairs[i]-1 @@ -168,7 +168,7 @@ class data: lines = self.sections[field] for line in lines: words = line.split() - values = map(float,words) + values = list(map(float,words)) array.append(values) return array elif len(list) == 2: @@ -181,7 +181,7 @@ class data: vec.append(float(words[n])) return vec else: - raise StandardError, "invalid arguments for data.get()" + raise Exception("invalid arguments for data.get()") # -------------------------------------------------------------------- # reorder columns in a data file field @@ -192,10 +192,10 @@ class data: oldlines = self.sections[name] newlines = natoms*[""] for index in order: - for i in xrange(len(newlines)): + for i in range(len(newlines)): words = oldlines[i].split() newlines[i] += words[index-1] + " " - for i in xrange(len(newlines)): + for i in range(len(newlines)): newlines[i] += "\n" self.sections[name] = newlines @@ -206,7 +206,7 @@ class data: lines = self.sections[name] newlines = [] j = icol - 1 - for i in xrange(len(lines)): + for i in range(len(lines)): line = lines[i] words = line.split() words[j] = str(vector[i]) @@ -228,48 +228,48 @@ class data: self.replace("Atoms",self.names['x']+1,x) self.replace("Atoms",self.names['y']+1,y) self.replace("Atoms",self.names['z']+1,z) - - if dm.names.has_key("ix") and self.names.has_key("ix"): + + if "ix" in dm.names and "ix" in self.names: ix,iy,iz = dm.vecs(ntime,"ix","iy","iz") self.replace("Atoms",self.names['ix']+1,ix) self.replace("Atoms",self.names['iy']+1,iy) self.replace("Atoms",self.names['iz']+1,iz) - + # -------------------------------------------------------------------- # delete header value or section from data file def delete(self,keyword): - if self.headers.has_key(keyword): del self.headers[keyword] - elif self.sections.has_key(keyword): del self.sections[keyword] - else: raise StandardError, "keyword not found in data object" + if keyword in self.headers: del self.headers[keyword] + elif keyword in self.sections: del self.sections[keyword] + else: raise Exception("keyword not found in data object") # -------------------------------------------------------------------- # write out a LAMMPS data file def write(self,file): f = open(file,"w") - print >>f,self.title - + print(self.title, file=f) + # write any keywords in standard list hkeywords # in the order they are in hkeywords # then write any extra keywords at end of header section - + for keyword in hkeywords: - if self.headers.has_key(keyword): + if keyword in self.headers: if keyword == "xlo xhi" or keyword == "ylo yhi" or \ keyword == "zlo zhi": - pair = self.headers[keyword] - print >>f,pair[0],pair[1],keyword + pair = self.headers[keyword] + print(pair[0],pair[1],keyword, file=f) elif keyword == "xy xz yz": - triple = self.headers[keyword] - print >>f,triple[0],triple[1],triple[2],keyword + triple = self.headers[keyword] + print(triple[0],triple[1],triple[2],keyword, file=f) else: - print 
>>f,self.headers[keyword],keyword + print(self.headers[keyword],keyword, file=f) - for keyword in self.headers.keys(): + for keyword in list(self.headers.keys()): if keyword not in hkeywords: - print >>f,self.headers[keyword],keyword + print(self.headers[keyword],keyword, file=f) # write any sections in standard list skeywords # in the order they are in skeywords @@ -277,18 +277,18 @@ class data: for pair in skeywords: keyword = pair[0] - if self.sections.has_key(keyword): - print >>f,"\n%s\n" % keyword + if keyword in self.sections: + print("\n%s\n" % keyword, file=f) for line in self.sections[keyword]: - print >>f,line, + print(line, end='', file=f) skeyfirst = [pair[0] for pair in skeywords] - - for keyword in self.sections.keys(): + + for keyword in list(self.sections.keys()): if keyword not in skeyfirst: - print >>f,"\n%s\n" % keyword + print("\n%s\n" % keyword, file=f) for line in self.sections[keyword]: - print >>f,line, + print(line, end='', file=f) f.close() @@ -304,20 +304,20 @@ class data: def findtime(self,n): if n == 0: return 0 - raise StandardError, "no step %d exists" % (n) - + raise(Exception, "no step %d exists" % (n)) + # -------------------------------------------------------------------- # return list of atoms and bonds to viz for data object def viz(self,isnap): - if isnap: raise StandardError, "cannot call data.viz() with isnap != 0" - + if isnap: raise Exception("cannot call data.viz() with isnap != 0") + id = self.names["id"] type = self.names["type"] x = self.names["x"] y = self.names["y"] z = self.names["z"] - + xlohi = self.headers["xlo xhi"] ylohi = self.headers["ylo yhi"] zlohi = self.headers["zlo zhi"] @@ -336,7 +336,7 @@ class data: # assumes atoms are sorted so can lookup up the 2 atoms in each bond bonds = [] - if self.sections.has_key("Bonds"): + if "Bonds" in self.sections: bondlines = self.sections["Bonds"] for line in bondlines: words = line.split() @@ -349,8 +349,8 @@ class data: float(atom1words[z]), float(atom2words[x]),float(atom2words[y]), float(atom2words[z]), - float(atom1words[type]),float(atom2words[type])]) - + float(atom1words[type]),float(atom2words[type])]) + tris = [] lines = [] return 0,box,atoms,bonds,tris,lines @@ -375,8 +375,8 @@ class data: hkeywords = ["atoms","ellipsoids","lines","triangles","bodies", "bonds","angles","dihedrals","impropers", - "atom types","bond types","angle types","dihedral types", - "improper types", + "atom types","bond types","angle types","dihedral types", + "improper types", "xlo xhi","ylo yhi","zlo zhi","xy xz yz"] skeywords = [["Masses","atom types"], @@ -384,14 +384,14 @@ skeywords = [["Masses","atom types"], ["Lines","lines"],["Triangles","triangles"],["Bodies","bodies"], ["Velocities","atoms"], ["Bonds","bonds"], - ["Angles","angles"], + ["Angles","angles"], ["Dihedrals","dihedrals"], - ["Impropers","impropers"], + ["Impropers","impropers"], ["Pair Coeffs","atom types"], - ["Bond Coeffs","bond types"], + ["Bond Coeffs","bond types"], ["Angle Coeffs","angle types"], - ["Dihedral Coeffs","dihedral types"], - ["Improper Coeffs","improper types"], + ["Dihedral Coeffs","dihedral types"], + ["Improper Coeffs","improper types"], ["BondBond Coeffs","angle types"], ["BondAngle Coeffs","angle types"], ["MiddleBondTorsion Coeffs","dihedral types"], diff --git a/tools/tinker/tinker2lmp.py b/tools/tinker/tinker2lmp.py index 565d3f23fe..fe80be9a14 100644 --- a/tools/tinker/tinker2lmp.py +++ b/tools/tinker/tinker2lmp.py @@ -15,6 +15,7 @@ # Author: Steve Plimpton +from __future__ import print_function import 
sys,os,math from data import data @@ -29,20 +30,20 @@ DELTA = 0.001 # delta on LAMMPS shrink-wrap box size, in Angstroms def error(txt=""): if not txt: - print "Syntax: tinker2lmp.py -switch args ..." - print " -xyz file" - print " -amoeba file" - print " -hippo file" - print " -data file" - print " -bitorsion file" - print " -nopbc" - print " -pbc xhi yhi zhi" - else: print "ERROR:",txt + print("Syntax: tinker2lmp.py -switch args ...") + print(" -xyz file") + print(" -amoeba file") + print(" -hippo file") + print(" -data file") + print(" -bitorsion file") + print(" -nopbc") + print(" -pbc xhi yhi zhi") + else: print("ERROR:",txt) #sys.exit() # read and store values from a Tinker xyz file -class XYZfile: +class XYZfile(object): def __init__(self,file): lines = open(file,'r').readlines() header = lines[0] @@ -212,7 +213,7 @@ class XYZfile: def output(self,outfile): fp = open(outfile,'w') words = self.header.split() - print >>fp,self.natoms,"replicated",' '.join(words[1:]) + print(self.natoms,"replicated",' '.join(words[1:]), file=fp) id = self.id label = self.label @@ -225,9 +226,9 @@ class XYZfile: # NOTE: worry about formatting of line for i in range(self.natoms): - print >>fp,i+1,label[i],x[i],y[i],z[i],type[i], - for j in bonds[i]: print >>fp,j, - print >>fp + print(i+1,label[i],x[i],y[i],z[i],type[i], end=' ', file=fp) + for j in bonds[i]: print(j, end=' ', file=fp) + print(file=fp) fp.close() @@ -255,7 +256,7 @@ class XYZfile: # scalar force field params in Force Field Definition section # bond, angle, dihedral coeffs indexed by Tinker classes -class PRMfile: +class PRMfile(object): def __init__(self,file): lines = open(file,'r').readlines() self.nlines = len(lines) @@ -519,7 +520,7 @@ class PRMfile: error("torsion does not have triplets of params: %d %d %d %d" % \ (class1,class2,class3,class4)) - mfourier = (len(words)-5) / 3 + mfourier = int((len(words)-5)/3) oneparams = [class1,class2,class3,class4,mfourier] for iset in range(mfourier): @@ -743,7 +744,7 @@ if pbcflag: else: xlo = ylo = zlo = BIG xhi = yhi = zhi = -BIG - for i in xrange(natoms): + for i in range(natoms): xlo = min(xlo,float(x[i])) ylo = min(ylo,float(y[i])) zlo = min(zlo,float(z[i])) @@ -1097,11 +1098,11 @@ for i,one in enumerate(alist): nbonds,hcount = xyz.angle_hbond_count(atom1,atom2,atom3,lmptype,lmpmass) if nbonds != 3: - print "Center angle atom has wrong bond count" - print " angle atom IDs:",atom1,atom2,atom3 - print " angle atom classes:",c1,c2,c3 - print " Tinker FF file param options:",len(params[3]) - print " Nbonds and hydrogen count:",nbonds,hcount + print("Center angle atom has wrong bond count") + print(" angle atom IDs:",atom1,atom2,atom3) + print(" angle atom classes:",c1,c2,c3) + print(" Tinker FF file param options:",len(params[3])) + print(" Nbonds and hydrogen count:",nbonds,hcount) #sys.exit() NOTE: allow this for now if hcount == 0: which = 1 @@ -1109,22 +1110,22 @@ for i,one in enumerate(alist): which = 2 m += 1 - print "3-bond angle" - print " angle atom IDs:",atom1,atom2,atom3 - print " angle atom classes:",c1,c2,c3 - print " Tinker FF file param options:",len(params[3]) - print " Nbonds and hydrogen count:",nbonds,hcount - print " which:",which,m + print("3-bond angle") + print(" angle atom IDs:",atom1,atom2,atom3) + print(" angle atom classes:",c1,c2,c3) + print(" Tinker FF file param options:",len(params[3])) + print(" Nbonds and hydrogen count:",nbonds,hcount) + print(" which:",which,m) elif len(params[3]) == 3: nbonds,hcount = xyz.angle_hbond_count(atom1,atom2,atom3,lmptype,lmpmass) 
if nbonds != 4: - print "Center angle atom has wrong bond count" - print " angle atom IDs:",atom1,atom2,atom3 - print " angle atom classes:",c1,c2,c3 - print " Tinker FF file param options:",len(params[3]) - print " Nbonds and hydrogen count:",nbonds,hcount + print("Center angle atom has wrong bond count") + print(" angle atom IDs:",atom1,atom2,atom3) + print(" angle atom classes:",c1,c2,c3) + print(" Tinker FF file param options:",len(params[3])) + print(" Nbonds and hydrogen count:",nbonds,hcount) #sys.exit() NOTE: allow this for now if hcount == 0: which = 1 @@ -1170,7 +1171,7 @@ for itype in range(len(aparams)): elif (c3,c2,c1) in badict: n1,n2,r1,r2 = badict[(c3,c2,c1)] else: - print "Bond-stretch angle triplet not found: %d %d %d" % (c1,c2,c3) + print("Bond-stretch angle triplet not found: %d %d %d" % (c1,c2,c3)) n1,n2,r1,r2 = 4*[0.0] baparams.append((n1,n2,r1,r2)) @@ -1600,17 +1601,17 @@ if nbitorsions: nbitorsions) fp = open(bitorsionfile,'w') - print >>fp,"Tinker BiTorsion parameter file for fix bitorsion\n" - print >>fp,"%d bitorsion types" % len(bitorsionparams) + print("Tinker BiTorsion parameter file for fix bitorsion\n", file=fp) + print("%d bitorsion types" % len(bitorsionparams), file=fp) itype = 0 for nx,ny,array in bitorsionparams: itype += 1 - print >>fp - print >>fp,itype,nx,ny + print(file=fp) + print(itype,nx,ny, file=fp) for ix in range(nx): for iy in range(ny): xgrid,ygrid,value = array[ix][iy] - print >>fp," ",xgrid,ygrid,value + print(" ",xgrid,ygrid,value, file=fp) fp.close() lines = [] @@ -1624,21 +1625,21 @@ d.write(datafile) # print stats to screen -print "Natoms =",natoms -print "Ntypes =",ntypes -print "Tinker XYZ types =",len(tink2lmp) -print "Tinker PRM types =",prm.ntypes +print("Natoms =",natoms) +print("Ntypes =",ntypes) +print("Tinker XYZ types =",len(tink2lmp)) +print("Tinker PRM types =",prm.ntypes) #print "Tinker groups =",ngroups -print "Nmol =",nmol -print "Nbonds =",nbonds -print "Nangles =",nangles -print "Ndihedrals =",ndihedrals -print "Nimpropers =",nimpropers -print "Npitorsions =",npitorsions -print "Nbitorsions =",nbitorsions -print "Nbondtypes =",len(bparams) -print "Nangletypes =",len(aparams) -print "Ndihedraltypes =",len(dparams) -print "Nimpropertypes =",len(oparams) -print "Npitorsiontypes =",len(pitorsionparams) -print "Nbitorsiontypes =",len(bitorsionparams) +print("Nmol =",nmol) +print("Nbonds =",nbonds) +print("Nangles =",nangles) +print("Ndihedrals =",ndihedrals) +print("Nimpropers =",nimpropers) +print("Npitorsions =",npitorsions) +print("Nbitorsions =",nbitorsions) +print("Nbondtypes =",len(bparams)) +print("Nangletypes =",len(aparams)) +print("Ndihedraltypes =",len(dparams)) +print("Nimpropertypes =",len(oparams)) +print("Npitorsiontypes =",len(pitorsionparams)) +print("Nbitorsiontypes =",len(bitorsionparams)) From 93784f35e329c4068e7b904b0da27edf0b6a2bdb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 25 Jul 2022 15:34:44 -0500 Subject: [PATCH 086/181] Added ucl_erfc to the opencl, cuda and hip backends; reverted to using erfc instead of approximation to ensure double-precision matches --- lib/gpu/lal_amoeba.cu | 22 ++++++++++++++++------ lib/gpu/lal_hippo.cu | 20 +++++++++++++++----- lib/gpu/lal_pre_cuda_hip.h | 8 +++++--- lib/gpu/lal_preprocessor.h | 1 + src/GPU/pair_amoeba_gpu.cpp | 23 ----------------------- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 3b50feb6ed..d445305bb2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ 
-607,10 +607,13 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -800,7 +803,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -850,10 +853,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1005,7 +1010,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -1031,10 +1036,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1298,10 +1305,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index b47e2d50e3..4f31650f73 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1124,10 +1124,13 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -1400,10 +1403,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1551,7 +1556,7 @@ __kernel void k_hippo_umutual2b(const __global 
numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -1589,10 +1594,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1838,10 +1845,13 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h index 47a005b998..03c4fce85e 100644 --- a/lib/gpu/lal_pre_cuda_hip.h +++ b/lib/gpu/lal_pre_cuda_hip.h @@ -179,12 +179,15 @@ #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs +#define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt -#define ucl_recip(x) ((numtyp)1.0/(x)) +#define ucl_erfc erfc #else +#define ucl_exp expf +#define ucl_powr powf #define ucl_atan atanf #define ucl_cbrt cbrtf #define ucl_ceil ceilf @@ -192,8 +195,7 @@ #define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrtf #define ucl_sqrt sqrtf -#define ucl_exp expf -#define ucl_powr powf +#define ucl_erfc erfcf #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 2ef8af0911..c734e67b98 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -166,6 +166,7 @@ #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs +#define ucl_erfc erfc #if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 1376a6bd12..3b0268f6b4 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -949,29 +949,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, off2, &fieldp_pinned); -/* - // accumulate the field and fieldp values from the GPU lib - // field and fieldp may already have some nonzero values from kspace (umutual1) - - int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; - - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } - - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; - } -*/ } /* ---------------------------------------------------------------------- From a6066bab4d7dce10f985c1ec24b3bf45f0de0b82 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 29 Jul 2022 13:01:57 -0500 Subject: [PATCH 087/181] Called the induce real-space term before the kspace term --- src/AMOEBA/amoeba_induce.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index d78e2d3262..1fce9be736 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -576,14 +576,14 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } - // get the reciprocal space part of the mutual field - - if (polar_kspace_flag) umutual1(field,fieldp); - // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + // add the self-energy portion of the mutual field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; From e980838ae2a6c8218175edb77d003f5801abe2ef Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 2 Aug 2022 16:45:06 -0500 Subject: [PATCH 088/181] Added timings for real-space and k-space portions for the terms --- src/AMOEBA/amoeba_induce.cpp | 24 +++++++++++++++++++ src/AMOEBA/amoeba_multipole.cpp | 12 ++++++++++ src/AMOEBA/amoeba_polar.cpp | 12 ++++++++++ src/AMOEBA/pair_amoeba.cpp | 42 +++++++++++++++++++++++++++++++++ src/AMOEBA/pair_amoeba.h | 5 ++++ src/GPU/pair_amoeba_gpu.cpp | 12 ++++++++++ 6 files changed, 107 insertions(+) diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 1fce9be736..01491a8708 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -564,6 +564,8 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) int i,j; double term; + double time0,time1,time2; + // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -576,13 +578,18 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } + MPI_Barrier(world); + time0 = MPI_Wtime(); + // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = MPI_Wtime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); + time2 = MPI_Wtime(); // add the self-energy portion of the mutual field @@ -593,6 +600,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) fieldp[i][j] += term*uinp[i][j]; } } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -801,6 +813,8 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) int i,j; double term; + double time0,time1,time2; + // zero out field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -815,7 +829,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the reciprocal space part of the permanent field + MPI_Barrier(world); + time0 = MPI_Wtime(); + if (polar_kspace_flag) udirect1(field); + time1 = MPI_Wtime(); for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -826,6 +844,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the real space portion of the permanent field if (polar_rspace_flag) udirect2b(field,fieldp); + time2 = MPI_Wtime(); // get the self-energy portion of the permanent field @@ -836,6 +855,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) fieldp[i][j] += term*rpole[i][j+1]; } } + + // accumulate timing information + + time_direct_kspace += time1 - time0; + time_direct_rspace += time2 - time1; } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 886a64f150..603de5884d 100644 --- 
a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -54,6 +54,8 @@ void PairAmoeba::multipole() double qixx,qixy,qixz,qiyy,qiyz,qizz; double cii,dii,qii; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(MPOLE_LONG); @@ -77,13 +79,18 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; + MPI_Barrier(world); + time0 = MPI_Wtime(); + // compute the real space part of the Ewald summation if (mpole_rspace_flag) multipole_real(); + time1 = MPI_Wtime(); // compute the reciprocal space part of the Ewald summation if (mpole_kspace_flag) multipole_kspace(); + time2 = MPI_Wtime(); // compute the Ewald self-energy term over all the atoms @@ -108,6 +115,11 @@ void PairAmoeba::multipole() e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0)); empole += e; } + + // accumulate timing information + + time_mpole_rspace += time1 - time0; + time_mpole_kspace += time2 - time1; } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 646d045504..6312de77e9 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -52,6 +52,8 @@ void PairAmoeba::polar() double fix[3],fiy[3],fiz[3]; double tep[3]; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(POLAR_LONG); @@ -73,11 +75,16 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions + MPI_Barrier(world); + time0 = MPI_Wtime(); + if (polar_rspace_flag) polar_real(); + time1 = MPI_Wtime(); // compute the reciprocal space part of dipole interactions if (polar_kspace_flag) polar_kspace(); + time2 = MPI_Wtime(); // compute the Ewald self-energy torque and virial terms @@ -130,6 +137,11 @@ void PairAmoeba::polar() virpolar[4] -= vxz; virpolar[5] -= vyz; } + + // accumulate timing information + + time_polar_rspace += time1 - time0; + time_polar_kspace += time2 - time1; } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index bb0734cf41..c62aac87e9 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -342,6 +342,11 @@ void PairAmoeba::compute(int eflag, int vflag) if (update->ntimestep <= update->beginstep+1) { time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0; time_induce = time_polar = time_qxfer = 0.0; + + time_mpole_rspace = time_mpole_kspace = 0.0; + time_direct_rspace = time_direct_kspace = 0.0; + time_mutual_rspace = time_mutual_kspace = 0.0; + time_polar_rspace = time_polar_kspace = 0.0; } double time0,time1,time2,time3,time4,time5,time6,time7,time8; @@ -511,6 +516,32 @@ void PairAmoeba::finish() MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_qxfer = ave/comm->nprocs; + // real-space/kspace breakdown + + MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_kspace = 
ave/comm->nprocs; + + MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_kspace = ave/comm->nprocs; + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -529,6 +560,17 @@ void PairAmoeba::finish() if (!amoeba) utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); + + utils::logmesg(lmp," Real-space timing breakdown:\n"); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); + utils::logmesg(lmp," K-space timing breakdown:\n"); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); } } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 2431b99859..8195b1d16f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -88,6 +88,11 @@ class PairAmoeba : public Pair { double time_init,time_hal,time_repulse,time_disp; double time_mpole,time_induce,time_polar,time_qxfer; + double time_mpole_rspace,time_mpole_kspace; + double time_direct_rspace,time_direct_kspace; + double time_mutual_rspace,time_mutual_kspace; + double time_polar_rspace,time_polar_kspace; + // energy/virial components double ehal,erepulse,edisp,epolar,empole,eqxfer; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3b0268f6b4..582eb7b595 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -857,6 +857,8 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) int i,j; double term; + double time0,time1,time2; + // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -871,11 +873,16 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // get the real space portion of the mutual field first + MPI_Barrier(world); + time0 = MPI_Wtime(); + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = MPI_Wtime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); + time2 = MPI_Wtime(); // add the self-energy portion of the mutual field @@ -910,6 +917,11 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; } /* ---------------------------------------------------------------------- From a54f0b684dbda1adf3b1d918302ef5540fb5a24f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 3 Aug 2022 10:56:52 -0500 Subject: [PATCH 089/181] Moved temp variables inside the loop over neighbors --- lib/gpu/lal_amoeba.cu | 34 
++++++++++++++++++++++++---------- lib/gpu/lal_hippo.cu | 15 +++++++++++---- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index d445305bb2..173770f666 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -448,11 +448,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + int m; for (m = 1; m < 6; m++) { bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; @@ -625,6 +627,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, } for (m = 0; m < 6; m++) bn[m] *= felec; + numtyp term1,term2,term3; + numtyp term4,term5,term6; + term1 = ci*ck; term2 = ck*dir - ci*dkr + dik; term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); @@ -757,8 +762,8 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - numtyp bn[4],bcn[3]; - numtyp fid[3],fip[3]; + //numtyp bn[4],bcn[3]; + //numtyp fid[3],fip[3]; const numtyp4 pol1i = polar1[i]; numtyp dix = pol1i.y; // rpole[i][1]; @@ -853,6 +858,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp bn[4],bcn[3]; /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; @@ -900,6 +906,8 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + + numtyp fid[3]; fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; @@ -908,6 +916,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + numtyp fip[3]; fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; @@ -980,8 +989,8 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } int itype,igroup; - numtyp bn[4],bcn[3]; - numtyp fid[3],fip[3]; + //numtyp bn[4],bcn[3]; + //numtyp fid[3],fip[3]; itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; @@ -1036,6 +1045,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp bn[4]; /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; @@ -1068,6 +1078,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } numtyp scalek = factor_uscale; + numtyp bcn[3]; bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; @@ -1081,10 +1092,13 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); + + numtyp 
fid[3]; fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + numtyp fip[3]; fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 4f31650f73..3897a9e5ad 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1303,8 +1303,8 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - numtyp bn[4],bcn[3]; - numtyp fid[3],fip[3]; + //numtyp bn[4],bcn[3]; + //numtyp fid[3],fip[3]; const numtyp4 pol1i = polar1[i]; numtyp dix = pol1i.y; // rpole[i][1]; @@ -1403,6 +1403,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp bn[4],bcn[3]; /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; @@ -1429,6 +1430,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp rr5k = bn[2] - ((numtyp)1.0-scalek*dmpk[4])*rr5; numtyp rr7k = bn[3] - ((numtyp)1.0-scalek*dmpk[6])*rr7; rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; + numtyp fid[3]; fid[0] = -xr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - rr3k*dkx + (numtyp)2.0*rr5k*qkx; fid[1] = -yr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - @@ -1445,6 +1447,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, rr5k = bn[2] - ((numtyp)1.0-scalek*dmpk[4])*rr5; rr7k = bn[3] - ((numtyp)1.0-scalek*dmpk[6])*rr7; rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; + numtyp fip[3]; fip[0] = -xr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - rr3k*dkx + (numtyp)2.0*rr5k*qkx; fip[1] = -yr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - @@ -1524,8 +1527,8 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, } int itype,igroup; - numtyp bn[4],bcn[3]; - numtyp fid[3],fip[3]; + //numtyp bn[4],bcn[3]; + //numtyp fid[3],fip[3]; itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; @@ -1594,6 +1597,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp bn[4]; /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; @@ -1627,10 +1631,13 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); + + numtyp fid[3]; fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + numtyp fip[3]; fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; From aad4e417f9a9adfeceade97d66e9e36e26ea5aac Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 3 Aug 2022 12:33:48 -0500 Subject: [PATCH 090/181] Moved temp variables inside neighbor loops --- lib/gpu/lal_amoeba.cu | 25 ++++++++++++++++++++++--- lib/gpu/lal_hippo.cu | 28 ++++++++++++++++++++-------- src/AMOEBA/pair_amoeba.cpp | 20 ++++++++++---------- 3 files changed, 52 insertions(+), 21 
deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 173770f666..6f0c7c8433 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -621,7 +621,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, int m; for (m = 1; m < 6; m++) { - bfac = (numtyp) (m+m-1); + numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } @@ -1170,7 +1170,8 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, //numtyp4 xi__; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); for (m = 1; m <= 4; m++) { - bfac = (numtyp) (m+m-1); + numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 3897a9e5ad..5b88ac4955 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -955,10 +955,10 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); for (m = 1; m < 6; m++) { - bfac = (numtyp) (m+m-1); + numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } for (m = 0; m < 6; m++) bn[m] *= felec; + numtyp term1,term2,term3; + numtyp term4,term5,term6; + term1 = corei*corek; numtyp term1i = corek*vali; numtyp term2i = corek*dir; @@ -1711,13 +1715,15 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, //numtyp4 xi__; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); for (m = 1; m <= 4; m++) { - bfac = (numtyp) (m+m-1); + numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } @@ -1875,6 +1883,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp sc3 = (numtyp)1.0; numtyp sc5 = (numtyp)1.0; numtyp sc7 = (numtyp)1.0; + numtyp rc3[3],rc5[3],rc7[3]; for (k = 0; k < 3; k++) { rc3[k] = (numtyp)0.0; rc5[k] = (numtyp)0.0; @@ -2064,6 +2073,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcy = (numtyp)-2.0 * depy; numtyp frcz = (numtyp)-2.0 * depz; + numtyp term1,term2,term3; + //numtyp term4,term5,term6,term7; + // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo // tixx and tkxx diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index c62aac87e9..d5270af450 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -561,16 +561,16 @@ void PairAmoeba::finish() utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); - utils::logmesg(lmp," Real-space timing breakdown:\n"); - utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); - utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); - utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); - utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); - utils::logmesg(lmp," K-space timing breakdown:\n"); - utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); - utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); - utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); - utils::logmesg(lmp," Polar time: 
{:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); + utils::logmesg(lmp," Real-space timing breakdown:\n"); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); + utils::logmesg(lmp," K-space timing breakdown:\n"); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); } } From 538aa13693bb5a9d9749e6361eda70ade3ef208c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 10 Aug 2022 16:21:30 -0500 Subject: [PATCH 091/181] Only transfer data that is needed for umutual2b; allowed convolution and kspace term umutual1 to be overridden by the gpu counterparts --- lib/gpu/lal_base_amoeba.cpp | 3 ++- src/AMOEBA/amoeba_convolution.h | 6 +++--- src/AMOEBA/pair_amoeba.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 6f65c8c934..3b67ee31a1 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -518,7 +518,8 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double void** fieldp_ptr) { // all the necessary data arrays are already copied from host to device - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); atom->add_extra_data(); _off2_polar = off2_polar; diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 270a501a71..00f2b8ed91 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -47,7 +47,7 @@ class AmoebaConvolution : protected Pointers { FFT_SCALAR *pre_convolution(); void *post_convolution(); - private: + protected: int which; // caller name for convolution being performed int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick int nbrick_owned; // owned grid points in brick decomp @@ -71,9 +71,9 @@ class AmoebaConvolution : protected Pointers { void *zero_3d(); void *zero_4d(); FFT_SCALAR *pre_convolution_3d(); - FFT_SCALAR *pre_convolution_4d(); + virtual FFT_SCALAR *pre_convolution_4d(); void *post_convolution_3d(); - void *post_convolution_4d(); + virtual void *post_convolution_4d(); void kspacebbox(double, double *); void procs2grid2d(int, int, int, int &, int &); diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 8195b1d16f..93978ab1f2 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -381,7 +381,7 @@ class PairAmoeba : public Pair { void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); void dfield0c(double **, double **); - void umutual1(double **, double **); + virtual void umutual1(double **, double **); virtual void umutual2b(double **, 
double **); void udirect1(double **); virtual void udirect2b(double **, double **); From c13f825648ea36523da2194691cc1be1cd8eca63 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 10 Aug 2022 16:24:20 -0500 Subject: [PATCH 092/181] Added AmoebaConvolutionGPU class: need to replace fft compute with the GPU-accelerated backend --- src/GPU/amoeba_convolution_gpu.cpp | 140 +++++++++++++++++++++++++++++ src/GPU/amoeba_convolution_gpu.h | 34 +++++++ 2 files changed, 174 insertions(+) create mode 100644 src/GPU/amoeba_convolution_gpu.cpp create mode 100644 src/GPU/amoeba_convolution_gpu.h diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp new file mode 100644 index 0000000000..976a115fe1 --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -0,0 +1,140 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "amoeba_convolution_gpu.h" +#include "comm.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "gridcomm.h" + +using namespace LAMMPS_NS; + +#define SCALE 0 + +// External functions from GPU library + +//int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int size, const int flag); + +/* ---------------------------------------------------------------------- + partition an FFT grid across processors + both for a brick and FFT x pencil decomposition + nx,nz,nz = global FFT grid size + order = size of stencil in each dimension that maps atoms to grid + adapted from PPPM::set_grid_local() +------------------------------------------------------------------------- */ + +AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, + int nx_caller, int ny_caller, int nz_caller, + int order_caller, int which_caller) : + AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller, + which_caller) +{ +} + +/* ---------------------------------------------------------------------- + perform pre-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() +{ + int ix,iy,iz,n; + + // reverse comm for 4d brick grid + ghosts + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE GridComm"); +#endif + + gc->reverse_comm(GridComm::PAIR,amoeba,2,sizeof(FFT_SCALAR),which, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm"); + debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm"); +#endif + // copy owned 4d brick grid values to FFT grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cfft[n++] = cgrid_brick[iz][iy][ix][0]; + cfft[n++] = cgrid_brick[iz][iy][ix][1]; + } + + // remap FFT grid from brick to x pencil partitioning + // NOTE: could just setup FFT to start from brick decomp and skip remap + + remap->perform(cfft,cfft,remap_buf); + +#if DEBUG_AMOEBA + 
debug_scalar(FFT,"PRE Convo / POST Remap"); + debug_file(FFT,"pre.convo.post.remap"); +#endif + // perform forward FFT + + fft1->compute(cfft,cfft,FFT3d::FORWARD); + + if (SCALE) { + double scale = 1.0/nfft_global; + for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; + } + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"PRE Convo / POST FFT"); + debug_file(CFFT1,"pre.convo.post.fft"); +#endif + return cfft; +} + +/* ---------------------------------------------------------------------- + perform post-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +void *AmoebaConvolutionGPU::post_convolution_4d() +{ + int ix,iy,iz,n; + + // perform backward FFT + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"POST Convo / PRE FFT"); + debug_file(CFFT1,"post.convo.pre.fft"); +#endif + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + +#if DEBUG_AMOEBA + debug_scalar(CFFT2,"POST Convo / POST FFT"); + debug_file(CFFT2,"post.convo.post.fft"); +#endif + // copy 1d complex values into 4d complex grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cgrid_brick[iz][iy][ix][0] = cfft[n++]; + cgrid_brick[iz][iy][ix][1] = cfft[n++]; + } + + // forward comm to populate ghost grid values + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"POST Convo / PRE gridcomm"); + debug_file(GRIDBRICK_IN,"post.convo.pre.gridcomm"); +#endif + gc->forward_comm(GridComm::PAIR,amoeba,2,sizeof(FFT_SCALAR),which, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + + return (void *) cgrid_brick; +} diff --git a/src/GPU/amoeba_convolution_gpu.h b/src/GPU/amoeba_convolution_gpu.h new file mode 100644 index 0000000000..33c3a4aac1 --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.h @@ -0,0 +1,34 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H +#define LMP_AMOEBA_CONVOLUTION_GPU_H + +#include "amoeba_convolution.h" + + +namespace LAMMPS_NS { + +class AmoebaConvolutionGPU : public AmoebaConvolution { + public: + AmoebaConvolutionGPU(class LAMMPS *, class Pair *, + int, int, int, int, int); + + virtual FFT_SCALAR *pre_convolution_4d(); + virtual void *post_convolution_4d(); + +}; + +} + +#endif From f1112ab6b6225692df5e96ea685c33ca4c039adf Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 15 Aug 2022 14:28:46 -0500 Subject: [PATCH 093/181] Working on the gpu kspace induce term: dipole spreading and/or fft calls --- src/GPU/pair_amoeba_gpu.cpp | 135 +++++++++++++++++++++++++++++++++++- src/GPU/pair_amoeba_gpu.h | 2 + 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 582eb7b595..734ca53bba 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -18,7 +18,7 @@ #include "pair_amoeba_gpu.h" -#include "amoeba_convolution.h" +#include "amoeba_convolution_gpu.h" #include "atom.h" #include "comm.h" #include "domain.h" @@ -46,6 +46,8 @@ enum{GEAR,ASPC,LSQR}; enum{BUILD,APPLY}; enum{GORDON1,GORDON2}; +enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; + #define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye // External functions from cuda library for atom decomposition @@ -108,6 +110,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_dispersion_real_ready = false; // always false for AMOEBA gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; + gpu_umutual1_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host @@ -176,6 +179,17 @@ void PairAmoebaGPU::init_style() tq_single = false; else tq_single = true; + + // replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + + } } /* ---------------------------------------------------------------------- @@ -924,6 +938,125 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) time_mutual_kspace += time2 - time1; } +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual1(double **field, double **fieldp) +{ + int i,j,k,m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } + + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpre = 
(double ****) ic_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + double *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpost = (double ****) ic_kspace->post_convolution(); + + // get potential + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + // convert the dipole fields from fractional to Cartesian + + for (i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + + a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; + dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] + + a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3]; + } + } + + // increment the field at each multipole site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + field[i][j] -= dipfield1[i][j]; + fieldp[i][j] -= dipfield2[i][j]; + } + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e419ccd1a1..e0563cd8b5 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -38,6 +38,7 @@ class PairAmoebaGPU : public PairAmoeba { //virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); + virtual void umutual1(double **, double **); virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); @@ -54,6 +55,7 @@ class PairAmoebaGPU : public PairAmoeba { bool gpu_dispersion_real_ready; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; From 46b8b00a4faf716c1bad0139a37461138c572094 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 15 Aug 2022 15:51:43 -0500 Subject: [PATCH 094/181] Working on fft on the device --- lib/gpu/lal_amoeba_ext.cpp | 4 ++++ lib/gpu/lal_base_amoeba.cpp | 22 ++++++++++++++++++++++ lib/gpu/lal_base_amoeba.h | 5 ++++- src/GPU/amoeba_convolution_gpu.cpp | 5 ++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 63ed683833..be183b284d 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,6 +162,10 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int 
*host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void amoeba_compute_fft1d(void** in, void** out, const int mode) { + AMOEBAMF.compute_fft1d(in, out, mode); +} + double amoeba_gpu_bytes() { return AMOEBAMF.host_memory_usage(); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 3b67ee31a1..b0d6ecee68 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -568,12 +568,30 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, _tep.update_host(_max_tep_size*4,false); } +// --------------------------------------------------------------------------- +// Return the memory bytes allocated on the host and device +// --------------------------------------------------------------------------- + template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ 4*sizeof(numtyp)+sizeof(BaseAmoeba); } +// --------------------------------------------------------------------------- +// Compute FFT +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode) +{ + +} + +// --------------------------------------------------------------------------- +// Copy the extra data from host to device +// --------------------------------------------------------------------------- + template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, double** uind, double** uinp, double* pval) { @@ -645,6 +663,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, } } +// --------------------------------------------------------------------------- +// Compile (load) the kernel strings and set the kernels +// --------------------------------------------------------------------------- + template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_multipole, diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index f439e2945f..cf767be96e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -189,7 +189,10 @@ class BaseAmoeba { // _fieldp store both arrays, one after another _fieldp.update_host(_max_fieldp_size*8,false); } - + + /// compute forward/backward FFT on the device + void compute_fft1d(void** in, void** out, const int mode); + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index 976a115fe1..ad52df3d4b 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -23,7 +23,8 @@ using namespace LAMMPS_NS; // External functions from GPU library -//int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int size, const int flag); +int amoeba_setup_fft(const int size); +int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode); /* ---------------------------------------------------------------------- partition an FFT grid across processors @@ -39,6 +40,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller, which_caller) { + } /* ---------------------------------------------------------------------- @@ -81,6 +83,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() debug_scalar(FFT,"PRE Convo / POST Remap"); 
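Interface note: the amoeba_compute_fft1d(void**, void**, mode) entry point added at this stage carries no size information, so the GPU library cannot yet create an FFT plan from it, and BaseAmoebaT::compute_fft1d() is still an empty stub. Later commits in this series widen the prototypes to roughly the following shape (numel counts the interleaved re/im doubles of the owned FFT data):

    // revised prototypes adopted later in this series
    void amoeba_setup_fft(const int numel, const int element_type);
    void amoeba_compute_fft1d(void *in, void *out, const int numel, const int mode);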
debug_file(FFT,"pre.convo.post.remap"); #endif + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); From 28dabb9687db9f30232cde5929debba6b8ef0396 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 16 Aug 2022 15:37:49 -0500 Subject: [PATCH 095/181] Cleaned up unused variables in the amoeba kernels, made room for convolution gpu --- lib/gpu/lal_amoeba.cu | 68 +++--------------------------- lib/gpu/lal_amoeba_ext.cpp | 4 ++ lib/gpu/lal_base_amoeba.cpp | 12 +++++- lib/gpu/lal_base_amoeba.h | 5 +++ src/GPU/amoeba_convolution_gpu.cpp | 9 +++- src/GPU/pair_amoeba_gpu.cpp | 3 +- 6 files changed, 34 insertions(+), 67 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 6f0c7c8433..1b2900f97f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -515,8 +515,8 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, const numtyp4 pol3j = polar3[j]; numtyp qkyz = pol3j.x; // rpole[j][9]; numtyp qkzz = pol3j.y; // rpole[j][12]; - int jtype = pol3j.z; // amtype[j]; - int jgroup = pol3j.w; // amgroup[j]; + //int jtype = pol3j.z; // amtype[j]; + //int jgroup = pol3j.w; // amgroup[j]; const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; @@ -546,18 +546,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp dirx = diy*zr - diz*yr; numtyp diry = diz*xr - dix*zr; numtyp dirz = dix*yr - diy*xr; - numtyp dkrx = dky*zr - dkz*yr; - numtyp dkry = dkz*xr - dkx*zr; - numtyp dkrz = dkx*yr - dky*xr; numtyp dikx = diy*dkz - diz*dky; numtyp diky = diz*dkx - dix*dkz; numtyp dikz = dix*dky - diy*dkx; numtyp qirx = qiz*yr - qiy*zr; numtyp qiry = qix*zr - qiz*xr; numtyp qirz = qiy*xr - qix*yr; - numtyp qkrx = qkz*yr - qky*zr; - numtyp qkry = qkx*zr - qkz*xr; - numtyp qkrz = qky*xr - qkx*yr; numtyp qikx = qky*qiz - qkz*qiy; numtyp qiky = qkz*qix - qkx*qiz; numtyp qikz = qkx*qiy - qky*qix; @@ -570,18 +564,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp qikrx = qizk*yr - qiyk*zr; numtyp qikry = qixk*zr - qizk*xr; numtyp qikrz = qiyk*xr - qixk*yr; - numtyp qkirx = qkzi*yr - qkyi*zr; - numtyp qkiry = qkxi*zr - qkzi*xr; - numtyp qkirz = qkyi*xr - qkxi*yr; numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; - numtyp diqkrx = diqkz*yr - diqky*zr; - numtyp diqkry = diqkx*zr - diqkz*xr; - numtyp diqkrz = diqky*xr - diqkx*yr; numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; @@ -735,7 +723,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_charge(); + //local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; @@ -751,8 +739,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -762,21 +748,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - //numtyp bn[4],bcn[3]; - //numtyp fid[3],fip[3]; - - const numtyp4 pol1i = polar1[i]; - 
numtyp dix = pol1i.y; // rpole[i][1]; - numtyp diy = pol1i.z; // rpole[i][2]; - numtyp diz = pol1i.w; // rpole[i][3]; - const numtyp4 pol2i = polar2[i]; - numtyp qixx = pol2i.x; // rpole[i][4]; - numtyp qixy = pol2i.y; // rpole[i][5]; - numtyp qixz = pol2i.z; // rpole[i][6]; - numtyp qiyy = pol2i.w; // rpole[i][8]; const numtyp4 pol3i = polar3[i]; - numtyp qiyz = pol3i.x; // rpole[i][9]; - numtyp qizz = pol3i.y; // rpole[i][12]; int itype = pol3i.z; // amtype[i]; int igroup = pol3i.w; // amgroup[i]; @@ -843,11 +815,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, // intermediates involving moments and separation distance - numtyp dir = dix*xr + diy*yr + diz*zr; - numtyp qix = qixx*xr + qixy*yr + qixz*zr; - numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; - numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; - numtyp qir = qix*xr + qiy*yr + qiz*zr; numtyp dkr = dkx*xr + dky*yr + dkz*zr; numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; @@ -959,7 +926,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_charge(); + //local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; @@ -977,8 +944,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -989,9 +954,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } int itype,igroup; - //numtyp bn[4],bcn[3]; - //numtyp fid[3],fip[3]; - itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; @@ -1008,7 +970,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, int j = jextra & NEIGHMASK15; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - //int jtype=jx.w; // Compute r12 numtyp xr = jx.x - ix.x; @@ -1171,23 +1132,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, if (ii +void BaseAmoebaT::setup_fft(const int size, const int element_type) +{ + +} + +// --------------------------------------------------------------------------- +// Compute FFT on the device // --------------------------------------------------------------------------- template diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index cf767be96e..2bff362f29 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -190,7 +190,12 @@ class BaseAmoeba { _fieldp.update_host(_max_fieldp_size*8,false); } + /// setup a plan for FFT, where size is the number of elements + + void setup_fft(const int size, const int element_type=0); + /// compute forward/backward FFT on the device + void compute_fft1d(void** in, void** out, const int mode); // -------------------------- DEVICE DATA ------------------------- diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index ad52df3d4b..f514a50620 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -21,10 +21,12 @@ using namespace LAMMPS_NS; #define SCALE 0 +enum {FORWARD,BACKWARD}; + // External functions from GPU library -int amoeba_setup_fft(const int size); -int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode); +int amoeba_setup_fft(const int size, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int mode); /* 
---------------------------------------------------------------------- partition an FFT grid across processors @@ -64,6 +66,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm"); debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm"); #endif + // copy owned 4d brick grid values to FFT grid n = 0; @@ -88,6 +91,8 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() fft1->compute(cfft,cfft,FFT3d::FORWARD); + //amoeba_compute_fft1d(cfft,cfft,FORWARD); + if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 734ca53bba..29db1b4c1b 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -38,6 +38,7 @@ using namespace LAMMPS_NS; using namespace MathConst; +// same as in amoeba_induce.cpp enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; @@ -46,6 +47,7 @@ enum{GEAR,ASPC,LSQR}; enum{BUILD,APPLY}; enum{GORDON1,GORDON2}; +// same as in pair_amoeba.cpp enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; #define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye @@ -188,7 +190,6 @@ void PairAmoebaGPU::init_style() ic_kspace = new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); } - } } From 921796a15f012659aa0b0bca57be71b547ec905f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 16 Aug 2022 16:29:38 -0500 Subject: [PATCH 096/181] Cleaned up unused variables in the hippo kernels --- lib/gpu/lal_hippo.cu | 152 ++++++------------------------------------- 1 file changed, 20 insertions(+), 132 deletions(-) diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 5b88ac4955..be8d2c0701 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -467,7 +467,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, } const numtyp4 pol1i = polar1[i]; - numtyp ci = pol1i.x; // rpole[i][0]; + //numtyp ci = pol1i.x; // rpole[i][0]; numtyp dix = pol1i.y; // rpole[i][1]; numtyp diy = pol1i.z; // rpole[i][2]; numtyp diz = pol1i.w; // rpole[i][3]; @@ -501,7 +501,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, if (r2>off2) continue; const numtyp4 pol1j = polar1[j]; - numtyp ck = pol1j.x; // rpole[j][0]; + //numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; numtyp dky = pol1j.z; // rpole[j][2]; numtyp dkz = pol1j.w; // rpole[j][3]; @@ -548,18 +548,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp dirx = diy*zr - diz*yr; numtyp diry = diz*xr - dix*zr; numtyp dirz = dix*yr - diy*xr; - numtyp dkrx = dky*zr - dkz*yr; - numtyp dkry = dkz*xr - dkx*zr; - numtyp dkrz = dkx*yr - dky*xr; numtyp dikx = diy*dkz - diz*dky; numtyp diky = diz*dkx - dix*dkz; numtyp dikz = dix*dky - diy*dkx; numtyp qirx = qiz*yr - qiy*zr; numtyp qiry = qix*zr - qiz*xr; numtyp qirz = qiy*xr - qix*yr; - numtyp qkrx = qkz*yr - qky*zr; - numtyp qkry = qkx*zr - qkz*xr; - numtyp qkrz = qky*xr - qkx*yr; numtyp qikx = qky*qiz - qkz*qiy; numtyp qiky = qkz*qix - qkx*qiz; numtyp qikz = qkx*qiy - qky*qix; @@ -572,18 +566,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qikrx = qizk*yr - qiyk*zr; numtyp qikry = qixk*zr - qizk*xr; numtyp qikrz = qiyk*xr - qixk*yr; - numtyp qkirx = qkzi*yr - qkyi*zr; - 
numtyp qkiry = qkxi*zr - qkzi*xr; - numtyp qkirz = qkyi*xr - qkxi*yr; numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; - numtyp diqkrx = diqkz*yr - diqky*zr; - numtyp diqkry = diqkx*zr - diqkz*xr; - numtyp diqkrz = diqky*xr - diqkx*yr; numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; @@ -768,8 +756,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -955,10 +941,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, if (ii Date: Tue, 23 Aug 2022 15:42:05 -0500 Subject: [PATCH 097/181] First attempt to port the forward FFT in the k-space induce term to the GPU, not working yet --- lib/gpu/Makefile.lammps.standard | 2 +- lib/gpu/lal_amoeba_ext.cpp | 8 +-- lib/gpu/lal_base_amoeba.cpp | 84 +++++++++++++++++++++++++++++- lib/gpu/lal_base_amoeba.h | 21 +++++++- src/AMOEBA/amoeba_convolution.cpp | 38 +++++++++++++- src/AMOEBA/amoeba_convolution.h | 2 + src/AMOEBA/pair_amoeba.cpp | 10 ++++ src/GPU/amoeba_convolution_gpu.cpp | 30 +++++++++-- 8 files changed, 181 insertions(+), 14 deletions(-) diff --git a/lib/gpu/Makefile.lammps.standard b/lib/gpu/Makefile.lammps.standard index 9526e8e373..0bb3394b3e 100644 --- a/lib/gpu/Makefile.lammps.standard +++ b/lib/gpu/Makefile.lammps.standard @@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda endif gpu_SYSINC = -gpu_SYSLIB = -lcudart -lcuda +gpu_SYSLIB = -lcudart -lcuda -lcufft gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 304159e571..7d9d836b29 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,12 +162,12 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void amoeba_setup_fft(const int size, const int element_type) { - AMOEBAMF.setup_fft(size, element_type); +void amoeba_setup_fft(const int numel, const int element_type) { + AMOEBAMF.setup_fft(numel, element_type); } -void amoeba_compute_fft1d(void** in, void** out, const int mode) { - AMOEBAMF.compute_fft1d(in, out, mode); +void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) { + AMOEBAMF.compute_fft1d(in, out, numel, mode); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 05a48f9588..2f3c04c7f1 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -15,6 +15,7 @@ ***************************************************************************/ #include "lal_base_amoeba.h" + namespace LAMMPS_AL { #define BaseAmoebaT BaseAmoeba @@ -39,6 +40,9 @@ BaseAmoebaT::~BaseAmoeba() { k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); + + //if (cufft_plan_created) cufftDestroy(plan); + if (pair_program) delete pair_program; } @@ -137,11 +141,15 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_thetai_size = 0; 
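Context for the cuFFT experiment this commit starts (the implementation of compute_fft1d() with cufftPlan1d/cufftExecZ2Z appears further down): the commit message already flags it as not working yet, and one structural gap to close is that a single 1D Z2Z transform over the whole owned pencil buffer is not the same operation as FFT3d's distributed 3D transform, which chains per-line 1D FFTs with inter-process remaps. A device-side equivalent would need either batched 1D plans (cufftPlanMany) combined with the existing remaps, or, on a single rank, one 3D plan. A minimal single-rank illustration, not the code in this series, with d_cfft a placeholder device buffer of nx*ny*nz cufftDoubleComplex values packed x-fastest:

    // illustrative only: full-grid 3D transform on one rank
    cufftHandle plan3d;
    cufftPlan3d(&plan3d, nz, ny, nx, CUFFT_Z2Z);    // slowest-varying dim first
    cufftExecZ2Z(plan3d, d_cfft, d_cfft, CUFFT_FORWARD);
    cudaDeviceSynchronize();
    cufftDestroy(plan3d);

Whatever plan is used, copying the grid host-to-device and back on every call is likely to dominate the saved FFT time, a concern the series itself raises later when it questions whether the FFT is heavy enough to be worth moving to the device.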
+ _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + cufft_plan_created = false; + return success; } @@ -169,6 +177,9 @@ void BaseAmoebaT::clear_atomic() { _tep.clear(); _fieldp.clear(); + _thetai1.clear(); + _thetai2.clear(); + _thetai3.clear(); dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -422,6 +433,36 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Prepare for umutual1: bspline_fill +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed +// - transfer extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_umutual1(const int ago, const int inum_full, const int nall, + const int bsordermax, double **host_x, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, void* grid) { + + _bsordermax = bsordermax; + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + _thetai1.alloc(_max_thetai_size*_bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai2.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai3.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + } else { + if (inum_full>_max_thetai_size) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*_bsordermax*4); + _thetai2.resize(_max_thetai_size*_bsordermax*4); + _thetai3.resize(_max_thetai_size*_bsordermax*4); + } + } + +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- @@ -583,7 +624,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const { // --------------------------------------------------------------------------- template -void BaseAmoebaT::setup_fft(const int size, const int element_type) +void BaseAmoebaT::setup_fft(const int numel, const int element_type) { } @@ -593,9 +634,48 @@ void BaseAmoebaT::setup_fft(const int size, const int element_type) // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode) +void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { + if (cufft_plan_created == false) { + int m = numel/2; + cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); + cufft_plan_created = true; + } + // n = number of double complex + int n = numel/2; + + // copy the host array to the device (data) + UCL_Vector data; + data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE); + int m = 0; + double* d_in = (double*)in; + for (int i = 0; i < n; i++) { + data[i].x = d_in[m]; + data[i].y = d_in[m+1]; + m += 2; + } + data.update_device(false); + + // perform the in-place forward FFT + + cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device, + (cufftDoubleComplex*)&data.device, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result); + ucl_device->sync(); + data.update_host(false); + + // copy 
back the data to the host array + + m = 0; + double* d_out = (double*)out; + for (int i = 0; i < n; i++) { + d_out[m] = data[i].x; + d_out[m+1] = data[i].y; + m += 2; + } + + data.clear(); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 2bff362f29..3d0b3ab1a4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -31,6 +31,14 @@ #include "geryon/nvd_texture.h" #endif +#if !defined(USE_OPENCL) && !defined(USE_HIP) +// temporary workaround for int2 also defined in cufft +#ifdef int2 +#undef int2 +#endif +#include "cufft.h" +#endif + namespace LAMMPS_AL { template @@ -142,6 +150,11 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); + virtual void precompute_umutual1(const int ago, const int inum_full, const int nall, + const int bsordermax, double **host_x, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, void* grid); + /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -196,7 +209,7 @@ class BaseAmoeba { /// compute forward/backward FFT on the device - void compute_fft1d(void** in, void** out, const int mode); + void compute_fft1d(void* in, void* out, const int numel, const int mode); // -------------------------- DEVICE DATA ------------------------- @@ -230,6 +243,10 @@ class BaseAmoeba { UCL_Vector _tep, _fieldp; int _nmax, _max_tep_size, _max_fieldp_size; + int _bsordermax; + UCL_Vector _thetai1, _thetai2, _thetai3; + int _max_thetai_size; + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; @@ -282,6 +299,8 @@ class BaseAmoeba { virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; + cufftHandle plan; + bool cufft_plan_created; }; } diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index 9c8f728f99..4dde750c61 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -203,7 +203,7 @@ AmoebaConvolution::AmoebaConvolution(LAMMPS *lmp, Pair *pair, fft1 = new FFT3d(lmp,world,nx,ny,nz, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,&tmp,0); + 1,0,&tmp,0); // 0,0,&tmp,0); fft2 = new FFT3d(lmp,world,nx,ny,nz, @@ -358,15 +358,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() cfft[n++] = ZEROF; } + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = MPI_Wtime(); if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -414,15 +422,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() debug_scalar(FFT,"PRE Convo / POST Remap"); debug_file(FFT,"pre.convo.post.remap"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = MPI_Wtime(); if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / 
POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -455,7 +472,16 @@ void *AmoebaConvolution::post_convolution_3d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); @@ -497,8 +523,18 @@ void *AmoebaConvolution::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); debug_file(CFFT2,"post.convo.post.fft"); diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 00f2b8ed91..8e7f09218a 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -47,6 +47,8 @@ class AmoebaConvolution : protected Pointers { FFT_SCALAR *pre_convolution(); void *post_convolution(); + double time_fft; + protected: int which; // caller name for convolution being performed int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index d5270af450..3b66ebc221 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -347,6 +347,10 @@ void PairAmoeba::compute(int eflag, int vflag) time_direct_rspace = time_direct_kspace = 0.0; time_mutual_rspace = time_mutual_kspace = 0.0; time_polar_rspace = time_polar_kspace = 0.0; + + if (ic_kspace) { + ic_kspace->time_fft = 0.0; + } } double time0,time1,time2,time3,time4,time5,time6,time7,time8; @@ -542,6 +546,10 @@ void PairAmoeba::finish() MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_polar_kspace = ave/comm->nprocs; + double time_mutual_fft = ic_kspace->time_fft; + MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_fft = ave/comm->nprocs; + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -570,7 +578,9 @@ void PairAmoeba::finish() utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," - FFT time: {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); + } } diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index f514a50620..f9daa06e65 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -21,12 +21,13 @@ using namespace LAMMPS_NS; #define SCALE 0 -enum {FORWARD,BACKWARD}; +//#define USE_AMOEBA_FFT +#ifdef USE_AMOEBA_FFT // External functions from GPU library - -int amoeba_setup_fft(const int size, const int element_type); -int amoeba_compute_fft1d(void* in, void* out, const int mode); +int amoeba_setup_fft(const int size, const int numel, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode); +#endif /* 
---------------------------------------------------------------------- partition an FFT grid across processors @@ -52,6 +53,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() { int ix,iy,iz,n; + double time0,time1; // reverse comm for 4d brick grid + ghosts @@ -87,11 +89,20 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() debug_file(FFT,"pre.convo.post.remap"); #endif + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT + #ifdef USE_AMOEBA_FFT + amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD); + #else fft1->compute(cfft,cfft,FFT3d::FORWARD); + #endif - //amoeba_compute_fft1d(cfft,cfft,FORWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; if (SCALE) { double scale = 1.0/nfft_global; @@ -119,7 +130,16 @@ void *AmoebaConvolutionGPU::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); From b2d6df5bfbe44b7092ab4588539113b94cd34023 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 25 Aug 2022 23:18:13 -0500 Subject: [PATCH 098/181] Re-arranged some for loops in umutual1 to improve cache-friendly memory access; made placeholder for grid_uind on the GPU lib, maybe FFT is not that heavy to be put on the device. --- lib/gpu/lal_amoeba_ext.cpp | 5 +++ src/GPU/pair_amoeba_gpu.cpp | 87 ++++++++++++++++++++++++++++++------- 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 7d9d836b29..6989a5e6f6 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,6 +162,11 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, double ***grid) { +} + void amoeba_setup_fft(const int numel, const int element_type) { AMOEBAMF.setup_fft(numel, element_type); } diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 29db1b4c1b..cd3c01cde3 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,6 +88,10 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); +void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, + double** host_thetai1, double** host_thetai2, + double** host_thetai3, double ***grid); + void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -869,7 +873,7 @@ void PairAmoebaGPU::udirect2b_cpu() void PairAmoebaGPU::ufield0c(double **field, double **fieldp) { - int i,j; + //int i,j; double term; double time0,time1,time2; @@ -879,13 +883,18 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) int nlocal = atom->nlocal; int nall = nlocal + atom->nghost; - for (i = 0; i < nall; i++) { - for (j = 0; j < 3; j++) { + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + +/* + for (int i = 0; i < nall; i++) { + for (int j = 0; j < 
3; j++) { field[i][j] = 0.0; fieldp[i][j] = 0.0; } } - +*/ + // get the real space portion of the mutual field first MPI_Barrier(world); @@ -902,13 +911,24 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // add the self-energy portion of the mutual field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } +/* for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { field[i][j] += term*uind[i][j]; fieldp[i][j] += term*uinp[i][j]; } } - +*/ // accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) @@ -947,7 +967,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) void PairAmoebaGPU::umutual1(double **field, double **fieldp) { - int i,j,k,m,n; + int m,n; int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; double term; double a[3][3]; // indices not flipped vs Fortran @@ -958,7 +978,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // convert Cartesian dipoles to fractional coordinates - for (j = 0; j < 3; j++) { + for (int j = 0; j < 3; j++) { a[0][j] = nfft1 * recip[0][j]; a[1][j] = nfft2 * recip[1][j]; a[2][j] = nfft3 * recip[2][j]; @@ -966,13 +986,25 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) int nlocal = atom->nlocal; + for (int i = 0; i < nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } +/* for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; } } - +*/ // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); @@ -1000,9 +1032,9 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // use qfac values stored in udirect1() m = n = 0; - for (k = nzlo; k <= nzhi; k++) { - for (j = nylo; j <= nyhi; j++) { - for (i = nxlo; i <= nxhi; i++) { + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { term = qfac[m++]; gridfft[n] *= term; gridfft[n+1] *= term; @@ -1023,8 +1055,8 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 10; j++) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { fopt[i][optlevel][j] = fdip_phi1[i][j]; foptp[i][optlevel][j] = fdip_phi2[i][j]; } @@ -1033,13 +1065,37 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // convert the dipole fields from fractional to Cartesian - for (i = 0; i < 3; i++) { + for (int i = 0; i < 3; i++) { a[0][i] = nfft1 * recip[0][i]; a[1][i] = nfft2 * 
recip[1][i]; a[2][i] = nfft3 * recip[2][i]; } - for (i = 0; i < nlocal; i++) { + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } +/* + for (int i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; @@ -1056,6 +1112,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fieldp[i][j] -= dipfield2[i][j]; } } +*/ } /* ---------------------------------------------------------------------- From b160460dccaa440a2475b0bceb164e9181bd80f1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 26 Aug 2022 12:55:46 -0500 Subject: [PATCH 099/181] Added preprocessors to comment out cufft entirely for now --- lib/gpu/Opencl.makefile | 2 +- lib/gpu/lal_base_amoeba.cpp | 14 +++++++++----- lib/gpu/lal_base_amoeba.h | 4 +++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 64a2161f85..d318da15dd 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h) OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h # Headers for Host files -HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \ +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 2f3c04c7f1..d552a53e5a 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -40,8 +40,10 @@ BaseAmoebaT::~BaseAmoeba() { k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); - - //if (cufft_plan_created) cufftDestroy(plan); + + #if !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created) cufftDestroy(plan); + #endif if (pair_program) delete pair_program; } @@ -148,7 +150,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); - cufft_plan_created = false; + fft_plan_created = false; return success; } @@ -636,10 +638,11 @@ void BaseAmoebaT::setup_fft(const int numel, const int element_type) template void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { - if (cufft_plan_created == false) { + #if !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created == false) { int m = numel/2; cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); - cufft_plan_created = true; + fft_plan_created = true; } // n = number of double complex @@ -676,6 +679,7 @@ void BaseAmoebaT::compute_fft1d(void* in, void* out, const int 
numel, const int } data.clear(); + #endif } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 3d0b3ab1a4..eb0eff1e8d 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -299,8 +299,10 @@ class BaseAmoeba { virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; + #if !defined(USE_OPENCL) && !defined(USE_HIP) cufftHandle plan; - bool cufft_plan_created; + #endif + bool fft_plan_created; }; } From 9e7bbad4d4a9b276005075088b4e405ba5ee37c7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 27 Aug 2022 13:19:52 -0500 Subject: [PATCH 100/181] Working on fphi_uind in the GPU lib --- lib/gpu/Nvidia.makefile | 18 +++++++++---- lib/gpu/lal_amoeba.cpp | 27 ++++++++++++++++++++ lib/gpu/lal_amoeba.h | 1 + lib/gpu/lal_base_amoeba.cpp | 50 +++++++++++++++++++++++++++++-------- lib/gpu/lal_base_amoeba.h | 21 +++++++++++----- lib/gpu/lal_hippo.cpp | 27 ++++++++++++++++++++ lib/gpu/lal_hippo.h | 1 + lib/gpu/lal_pppm.cu | 18 ++++++------- 8 files changed, 132 insertions(+), 31 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 56942d3f3c..768daff53a 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -1,9 +1,17 @@ +# Common headers for kernels +PRE1_H = lal_preprocessor.h lal_aux_fun1.h + # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \ lal_pre_cuda_hip.h -ALL_H = $(NVD_H) $(wildcard ./lal_*.h) +# Headers for Host files +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ + lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ + lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ + lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H) + # Source files SRCS := $(wildcard ./lal_*.cpp) OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) @@ -54,13 +62,13 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h -$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) +$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu $(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@ # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H) +$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT @@ -77,10 +85,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp $(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu - $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu + $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu - $(CUDA) -o $@ -c cudpp_mini/scan_app.cu + $(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini #endif # build libgpu.a diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 924a175cfe..498c55ceba 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -259,6 +259,33 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int 
AmoebaT::fphi_uind() { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); +/* + this->time_pair.start(); + + this->k_fphi_uind.set_size(GX,BX); + this->k_fphi_uind.run(); + this->time_pair.stop(); +*/ + + return GX; +} + // --------------------------------------------------------------------------- // Calculate the polar real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index d12b79719f..005ea14fb9 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -91,6 +91,7 @@ class Amoeba : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); + int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index d552a53e5a..88a2c87166 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -442,27 +442,36 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // --------------------------------------------------------------------------- template -void BaseAmoebaT::precompute_umutual1(const int ago, const int inum_full, const int nall, - const int bsordermax, double **host_x, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, void* grid) { +void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, int** host_igrid) { - _bsordermax = bsordermax; + _bsorder = bsorder; if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*_bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai2.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai3.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); - _thetai1.resize(_max_thetai_size*_bsordermax*4); - _thetai2.resize(_max_thetai_size*_bsordermax*4); - _thetai3.resize(_max_thetai_size*_bsordermax*4); + _thetai1.resize(_max_thetai_size*bsorder*4); + _thetai2.resize(_max_thetai_size*bsorder*4); + _thetai3.resize(_max_thetai_size*bsorder*4); + _igrid.resize(_max_thetai_size*4); } } + memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp)); + memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp)); + memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp)); + memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int)); + _thetai1.update_device(inum_full*bsorder*4,true); + _thetai2.update_device(inum_full*bsorder*4,true); 
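Layout note for the staging calls around this point, written as a sketch of the assumptions rather than a statement of the final design: the memcpy-based copies treat the host theta arrays as one contiguous block, which holds only when the library is built with numtyp as double and the source address is the contiguous data block; if host_thetai1 is a memory->create() style double**, that block starts at host_thetai1[0], not at the pointer array itself. On the device side, the natural flat indexing for a kernel such as k_fphi_uind is then

    // atom i, stencil point k (0 <= k < bsorder), one numtyp4 per entry:
    //   .x = B-spline value, .y/.z/.w = its first three derivatives
    int idx = i*bsorder + k;
    numtyp4 tx = thetai1[idx];   // x-direction weights
    numtyp4 ty = thetai2[idx];   // y-direction weights
    numtyp4 tz = thetai3[idx];   // z-direction weights
    // igrid holds, per atom, the anchor grid indices of its bsorder^3 stencil

so the interpolation loop walks iz, iy, ix over bsorder points each, multiplies grid values by products of tz, ty, tx components, and accumulates the induced-dipole potential and its derivatives into fdip_phi1, fdip_phi2 and fdip_sum_phi.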
+ _thetai3.update_device(inum_full*bsorder*4,true); + _igrid.update_device(inum_full*4,true); } // --------------------------------------------------------------------------- @@ -575,6 +584,25 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // _fieldp.update_host(_max_fieldp_size*8,false); } +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, int** igrid, + double ****host_grid, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi) +{ + // once allocation and transfers + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid); + + const int red_bllocks = fphi_uind(); +} + + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index eb0eff1e8d..68c3470977 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -150,10 +150,9 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - virtual void precompute_umutual1(const int ago, const int inum_full, const int nall, - const int bsordermax, double **host_x, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, void* grid); + virtual void precompute_induce(const int inum_full, const int bsorder, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, int** igrid); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -177,6 +176,12 @@ class BaseAmoeba { double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void **fieldp_ptr); + virtual void compute_fphi_uind(const int inum_full, const int bsorder, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, int** igrid, + double ****host_grid, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi); + /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, @@ -243,8 +248,9 @@ class BaseAmoeba { UCL_Vector _tep, _fieldp; int _nmax, _max_tep_size, _max_fieldp_size; - int _bsordermax; - UCL_Vector _thetai1, _thetai2, _thetai3; + int _bsorder; + UCL_Vector _thetai1, _thetai2, _thetai3; + UCL_Vector _igrid; int _max_thetai_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -297,8 +303,11 @@ class BaseAmoeba { virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; + virtual int fphi_uind() = 0; virtual int polar_real(const int eflag, const int vflag) = 0; + + #if !defined(USE_OPENCL) && !defined(USE_HIP) cufftHandle plan; #endif diff --git a/lib/gpu/lal_hippo.cpp 
b/lib/gpu/lal_hippo.cpp index 79a8772c3e..d980ae0ed6 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -592,6 +592,33 @@ int HippoT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int HippoT::fphi_uind() { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); +/* + this->time_pair.start(); + + this->k_fphi_uind.set_size(GX,BX); + this->k_fphi_uind.run(); + this->time_pair.stop(); +*/ + + return GX; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 492712eb85..cece72caac 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -157,6 +157,7 @@ class Hippo : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); + int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index e17df5b88c..a8e929efe4 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -273,19 +273,19 @@ __kernel void interp(const __global numtyp4 *restrict x_, int my=mz+fast_mul(ny,npts_x); for (int m=0; m Date: Mon, 29 Aug 2022 00:13:30 -0500 Subject: [PATCH 101/181] Adding fphi_uind kernel, working on the arrays allocation --- lib/gpu/Nvidia.makefile | 2 +- lib/gpu/lal_amoeba.cu | 268 ++++++++++++++++++++++++++++++++++++ lib/gpu/lal_base_amoeba.cpp | 13 +- lib/gpu/lal_base_amoeba.h | 2 +- 4 files changed, 279 insertions(+), 6 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 768daff53a..5f50486e28 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -62,7 +62,7 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h -$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) +$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu $(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@ diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 1b2900f97f..1239764108 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1615,6 +1615,274 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_fphi_uind(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int4 
*restrict igrid, + const __global numtyp4 *restrict grid, + __global numtyp4 *restrict fdip_phi1, + __global numtyp4 *restrict fdip_phi2, + __global numtyp4 *restrict fdip_sum_phi, + const int bsorder, const int inum, + const int t_per_atom) +{ + int tid, ii, offset, i, n_stride; + atom_info(t_per_atom,ii,tid,offset); + + if (iiucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder*4); _thetai2.resize(_max_thetai_size*bsorder*4); _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size*4); + _igrid.resize(_max_thetai_size); } } @@ -471,7 +471,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.update_device(inum_full*bsorder*4,true); _thetai2.update_device(inum_full*bsorder*4,true); _thetai3.update_device(inum_full*bsorder*4,true); - _igrid.update_device(inum_full*4,true); + _igrid.update_device(inum_full,true); } // --------------------------------------------------------------------------- @@ -593,12 +593,17 @@ template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, double **host_thetai3, int** igrid, - double ****host_grid, double **host_fdip_phi1, + double ****host_cgrid_brick, double **host_fdip_phi1, double **host_fdip_phi2, double **host_fdip_sum_phi) { // once allocation and transfers precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid); + // resize grid if needed, then copy from host to device + // cgrid_brick.alloc()/resize() + // cgrid_brick.begin() = host_cgrid_brick[0][0][0][0]; + // + const int red_bllocks = fphi_uind(); } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 68c3470977..f333bdf9a6 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -250,7 +250,7 @@ class BaseAmoeba { int _bsorder; UCL_Vector _thetai1, _thetai2, _thetai3; - UCL_Vector _igrid; + UCL_Vector _igrid; int _max_thetai_size; // ------------------------ FORCE/ENERGY DATA ----------------------- From aac264f2e27b9c7db7748c627e143a65afda8db1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 30 Aug 2022 23:40:04 -0500 Subject: [PATCH 102/181] Working on the fphi_uind kernel and array allocations --- lib/gpu/Nvidia.makefile | 26 +++- lib/gpu/lal_amoeba.cpp | 13 +- lib/gpu/lal_amoeba.cu | 265 +++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.cpp | 106 +++++++++++---- lib/gpu/lal_base_amoeba.h | 21 ++- 5 files changed, 283 insertions(+), 148 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 5f50486e28..c52246b06b 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -68,7 +68,31 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: 
lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 498c55ceba..38058bab55 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -275,13 +275,18 @@ int AmoebaT::fphi_uind() { const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); -/* - this->time_pair.start(); + this->time_pair.start(); + int ngridyz = this->_ngridy * this->_ngridz; this->k_fphi_uind.set_size(GX,BX); - this->k_fphi_uind.run(); + this->k_fphi_uind.run(&this->atom->x, &this->_thetai1, + &this->_thetai2, &this->_thetai3, + &this->_igrid, &this->_cgrid_brick, + &this->_fdip_phi1, &this->_fdip_phi2, + &this->_fdip_sum_phi, &this->_bsorder, + &ainum, &ngridyz, &this->_ngridy, + &this->_threads_per_atom); this->time_pair.stop(); -*/ return GX; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 1239764108..984154f16e 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1621,15 +1621,16 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict thetai1, - const __global numtyp4 *restrict thetai2, - const __global numtyp4 *restrict thetai3, - const __global int4 *restrict igrid, - const __global numtyp4 *restrict grid, - __global numtyp4 *restrict fdip_phi1, - __global numtyp4 *restrict fdip_phi2, - __global numtyp4 *restrict fdip_sum_phi, + const __global numtyp *restrict thetai1, + const __global numtyp *restrict thetai2, + const __global numtyp *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fdip_phi1, + __global numtyp *restrict fdip_phi2, + __global numtyp *restrict fdip_sum_phi, const int bsorder, const int inum, + const int nyzgrid, const int nygrid, const int t_per_atom) { int tid, ii, offset, i, n_stride; @@ -1666,46 +1667,46 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, // extract the permanent multipole field at each site - tuv100_1 = 0.0; - tuv010_1 = 0.0; - tuv001_1 = 0.0; - tuv200_1 = 0.0; - tuv020_1 = 0.0; - tuv002_1 = 0.0; - tuv110_1 = 0.0; - tuv101_1 = 0.0; - tuv011_1 = 0.0; - tuv100_2 = 0.0; - tuv010_2 = 0.0; - tuv001_2 = 0.0; - tuv200_2 = 0.0; - tuv020_2 = 0.0; - tuv002_2 = 0.0; - tuv110_2 = 0.0; - tuv101_2 = 0.0; - tuv011_2 = 0.0; - tuv000 = 0.0; - tuv001 = 0.0; - tuv010 = 0.0; - tuv100 = 0.0; - tuv200 = 0.0; - tuv020 = 0.0; - tuv002 = 0.0; - tuv110 = 0.0; - tuv101 = 0.0; - tuv011 = 0.0; - tuv300 = 0.0; - tuv030 = 0.0; - tuv003 = 0.0; - tuv210 = 0.0; - tuv201 = 0.0; - tuv120 = 0.0; - tuv021 = 0.0; - tuv102 = 0.0; - tuv012 = 0.0; - tuv111 = 0.0; + tuv100_1 = (numtyp)0.0; + tuv010_1 = 
(numtyp)0.0; + tuv001_1 = (numtyp)0.0; + tuv200_1 = (numtyp)0.0; + tuv020_1 = (numtyp)0.0; + tuv002_1 = (numtyp)0.0; + tuv110_1 = (numtyp)0.0; + tuv101_1 = (numtyp)0.0; + tuv011_1 = (numtyp)0.0; + tuv100_2 = (numtyp)0.0; + tuv010_2 = (numtyp)0.0; + tuv001_2 = (numtyp)0.0; + tuv200_2 = (numtyp)0.0; + tuv020_2 = (numtyp)0.0; + tuv002_2 = (numtyp)0.0; + tuv110_2 = (numtyp)0.0; + tuv101_2 = (numtyp)0.0; + tuv011_2 = (numtyp)0.0; + tuv000 = (numtyp)0.0; + tuv001 = (numtyp)0.0; + tuv010 = (numtyp)0.0; + tuv100 = (numtyp)0.0; + tuv200 = (numtyp)0.0; + tuv020 = (numtyp)0.0; + tuv002 = (numtyp)0.0; + tuv110 = (numtyp)0.0; + tuv101 = (numtyp)0.0; + tuv011 = (numtyp)0.0; + tuv300 = (numtyp)0.0; + tuv030 = (numtyp)0.0; + tuv003 = (numtyp)0.0; + tuv210 = (numtyp)0.0; + tuv201 = (numtyp)0.0; + tuv120 = (numtyp)0.0; + tuv021 = (numtyp)0.0; + tuv102 = (numtyp)0.0; + tuv012 = (numtyp)0.0; + tuv111 = (numtyp)0.0; - k = igrid[i].z - nlpts; + k = igrid[3*i+2] - nlpts; for (int kb = 0; kb < bsorder; kb++) { /* v0 = thetai3[m][kb][0]; @@ -1713,30 +1714,35 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = thetai3[m][kb][3]; */ - tu00_1 = 0.0; - tu01_1 = 0.0; - tu10_1 = 0.0; - tu20_1 = 0.0; - tu11_1 = 0.0; - tu02_1 = 0.0; - tu00_2 = 0.0; - tu01_2 = 0.0; - tu10_2 = 0.0; - tu20_2 = 0.0; - tu11_2 = 0.0; - tu02_2 = 0.0; - tu00 = 0.0; - tu10 = 0.0; - tu01 = 0.0; - tu20 = 0.0; - tu11 = 0.0; - tu02 = 0.0; - tu30 = 0.0; - tu21 = 0.0; - tu12 = 0.0; - tu03 = 0.0; + int i3 = m*4*bsorder + 4*kb; + v0 = thetai3[i3]; + v1 = thetai3[i3]+1; + v2 = thetai3[i3+2]; + v3 = thetai3[i3+3]; + tu00_1 = (numtyp)0.0; + tu01_1 = (numtyp)0.0; + tu10_1 = (numtyp)0.0; + tu20_1 = (numtyp)0.0; + tu11_1 = (numtyp)0.0; + tu02_1 = (numtyp)0.0; + tu00_2 = (numtyp)0.0; + tu01_2 = (numtyp)0.0; + tu10_2 = (numtyp)0.0; + tu20_2 = (numtyp)0.0; + tu11_2 = (numtyp)0.0; + tu02_2 = (numtyp)0.0; + tu00 = (numtyp)0.0; + tu10 = (numtyp)0.0; + tu01 = (numtyp)0.0; + tu20 = (numtyp)0.0; + tu11 = (numtyp)0.0; + tu02 = (numtyp)0.0; + tu30 = (numtyp)0.0; + tu21 = (numtyp)0.0; + tu12 = (numtyp)0.0; + tu03 = (numtyp)0.0; - j = igrid[i].y - nlpts; + j = igrid[3*i+1] - nlpts; for (int jb = 0; jb < bsorder; jb++) { /* u0 = thetai2[m][jb][0]; @@ -1744,19 +1750,24 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - t0_1 = 0.0; - t1_1 = 0.0; - t2_1 = 0.0; - t0_2 = 0.0; - t1_2 = 0.0; - t2_2 = 0.0; - t3 = 0.0; + int i2 = m*4*bsorder+4*jb; + u0 = thetai2[i2]; + u1 = thetai2[i2+1]; + u2 = thetai2[i2+2]; + u3 = thetai2[i2+3]; + t0_1 = (numtyp)0.0; + t1_1 = (numtyp)0.0; + t2_1 = (numtyp)0.0; + t0_2 = (numtyp)0.0; + t1_2 = (numtyp)0.0; + t2_2 = (numtyp)0.0; + t3 = (numtyp)0.0; - i = igrid[m].x - nlpts; + int ii = igrid[3*i] - nlpts; for (int ib = 0; ib < bsorder; ib++) { /* - tq_1 = grid[k][j][i][0]; - tq_2 = grid[k][j][i][1]; + tq_1 = grid[k][j][ii][0]; + tq_2 = grid[k][j][ii][1]; t0_1 += tq_1*thetai1[m][ib][0]; t1_1 += tq_1*thetai1[m][ib][1]; t2_1 += tq_1*thetai1[m][ib][2]; @@ -1765,7 +1776,22 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - i++; + int i1 = m*4*bsorder+4*ib; + numtyp w0 = thetai1[i1]; + numtyp w1 = thetai1[i1+1]; + numtyp w2 = thetai1[i1+2]; + numtyp w3 = thetai1[i1+3]; + int gidx = 2*(k*nyzgrid + j*nygrid + ii); + tq_1 = grid[gidx]; + tq_2 = grid[gidx+1]; + t0_1 += tq_1*w0; + t1_1 += tq_1*w1; + t2_1 += tq_1*w2; + t0_2 += tq_2*w0; + t1_2 += tq_2*w1; + t2_2 += 
tq_2*w2; + t3 += (tq_1+tq_2)*w3; + ii++; } tu00_1 += t0_1*u0; @@ -1836,6 +1862,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tuv111 += tu11*v1; k++; } + /* fdip_phi1[m][0] = 0.0; fdip_phi1[m][1] = tuv100_1; @@ -1847,39 +1874,51 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, fdip_phi1[m][7] = tuv110_1; fdip_phi1[m][8] = tuv101_1; fdip_phi1[m][9] = tuv011_1; - - fdip_phi2[m][0] = 0.0; - fdip_phi2[m][1] = tuv100_2; - fdip_phi2[m][2] = tuv010_2; - fdip_phi2[m][3] = tuv001_2; - fdip_phi2[m][4] = tuv200_2; - fdip_phi2[m][5] = tuv020_2; - fdip_phi2[m][6] = tuv002_2; - fdip_phi2[m][7] = tuv110_2; - fdip_phi2[m][8] = tuv101_2; - fdip_phi2[m][9] = tuv011_2; - - fdip_sum_phi[m][0] = tuv000; - fdip_sum_phi[m][1] = tuv100; - fdip_sum_phi[m][2] = tuv010; - fdip_sum_phi[m][3] = tuv001; - fdip_sum_phi[m][4] = tuv200; - fdip_sum_phi[m][5] = tuv020; - fdip_sum_phi[m][6] = tuv002; - fdip_sum_phi[m][7] = tuv110; - fdip_sum_phi[m][8] = tuv101; - fdip_sum_phi[m][9] = tuv011; - fdip_sum_phi[m][10] = tuv300; - fdip_sum_phi[m][11] = tuv030; - fdip_sum_phi[m][12] = tuv003; - fdip_sum_phi[m][13] = tuv210; - fdip_sum_phi[m][14] = tuv201; - fdip_sum_phi[m][15] = tuv120; - fdip_sum_phi[m][16] = tuv021; - fdip_sum_phi[m][17] = tuv102; - fdip_sum_phi[m][18] = tuv012; - fdip_sum_phi[m][19] = tuv111; */ + int idx = 10*m; + fdip_phi1[idx+0] = (numtyp)0.0; + fdip_phi1[idx+1] = tuv100_1; + fdip_phi1[idx+2] = tuv010_1; + fdip_phi1[idx+3] = tuv001_1; + fdip_phi1[idx+4] = tuv200_1; + fdip_phi1[idx+5] = tuv020_1; + fdip_phi1[idx+6] = tuv002_1; + fdip_phi1[idx+7] = tuv110_1; + fdip_phi1[idx+8] = tuv101_1; + fdip_phi1[idx+9] = tuv011_1; + + fdip_phi2[idx+0] = (numtyp)0.0; + fdip_phi2[idx+1] = tuv100_2; + fdip_phi2[idx+2] = tuv010_2; + fdip_phi2[idx+3] = tuv001_2; + fdip_phi2[idx+4] = tuv200_2; + fdip_phi2[idx+5] = tuv020_2; + fdip_phi2[idx+6] = tuv002_2; + fdip_phi2[idx+7] = tuv110_2; + fdip_phi2[idx+8] = tuv101_2; + fdip_phi2[idx+9] = tuv011_2; + + idx = 20*m; + fdip_sum_phi[idx+0] = tuv000; + fdip_sum_phi[idx+1] = tuv100; + fdip_sum_phi[idx+2] = tuv010; + fdip_sum_phi[idx+3] = tuv001; + fdip_sum_phi[idx+4] = tuv200; + fdip_sum_phi[idx+5] = tuv020; + fdip_sum_phi[idx+6] = tuv002; + fdip_sum_phi[idx+7] = tuv110; + fdip_sum_phi[idx+8] = tuv101; + fdip_sum_phi[idx+9] = tuv011; + fdip_sum_phi[idx+10] = tuv300; + fdip_sum_phi[idx+11] = tuv030; + fdip_sum_phi[idx+12] = tuv003; + fdip_sum_phi[idx+13] = tuv210; + fdip_sum_phi[idx+14] = tuv201; + fdip_sum_phi[idx+15] = tuv120; + fdip_sum_phi[idx+16] = tuv021; + fdip_sum_phi[idx+17] = tuv102; + fdip_sum_phi[idx+18] = tuv012; + fdip_sum_phi[idx+19] = tuv111; } } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c18b10675b..cd5a9abf81 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() { k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); + k_fphi_uind.clear(); k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); @@ -182,6 +183,11 @@ void BaseAmoebaT::clear_atomic() { _thetai1.clear(); _thetai2.clear(); _thetai3.clear(); + _igrid.clear(); + _fdip_phi1.clear(); + _fdip_phi2.clear(); + _cgrid_brick.clear(); + _fdip_sum_phi.clear(); dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -444,34 +450,70 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, - double 
**host_thetai3, int** host_igrid) { + double **host_thetai3, int** host_igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { _bsorder = bsorder; + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder*4); _thetai2.resize(_max_thetai_size*bsorder*4); _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size); + _igrid.resize(_max_thetai_size*3); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); } } - memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int)); - _thetai1.update_device(inum_full*bsorder*4,true); - _thetai2.update_device(inum_full*bsorder*4,true); - _thetai3.update_device(inum_full*bsorder*4,true); - _igrid.update_device(inum_full,true); + UCL_H_Vec dview; + + // copy from host to device + + dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai1,dview,false); + dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai2,dview,false); + dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai3,dview,false); + + UCL_H_Vec dview_int; + dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device)); + ucl_copy(_igrid,dview_int,false); + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx*_ngridy*_ngridz*2; + dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device)); + ucl_copy(_cgrid_brick,dview,false); + } // --------------------------------------------------------------------------- @@ -593,18 +635,35 @@ template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, double **host_thetai3, int** igrid, - double 
****host_cgrid_brick, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi) + double *host_grid_brick_start, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { - // once allocation and transfers - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid); + // allocation/resize and transfers (do this right after udirect?) - // resize grid if needed, then copy from host to device - // cgrid_brick.alloc()/resize() - // cgrid_brick.begin() = host_cgrid_brick[0][0][0][0]; - // + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out, + nxlo_out, nxhi_out); - const int red_bllocks = fphi_uind(); + // update the cgrid_brick with data host + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx*_ngridy*_ngridz*2; + + UCL_H_Vec dview; + dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device)); + ucl_copy(_cgrid_brick,dview,false); + + const int red_blocks = fphi_uind(); } @@ -814,6 +873,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); k_polar.set_function(*pair_program,kname_polar); + k_fphi_uind.set_function(*pair_program,"kname_fphi_uind"); k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,kname_special15); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index f333bdf9a6..8503e6fba4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -152,7 +152,10 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid); + double **host_thetai3, int** igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -179,8 +182,10 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, double **host_thetai3, int** igrid, - double ****host_grid, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi); + double *host_grid_brick_start, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -249,9 +254,12 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_Vector _thetai1, _thetai2, _thetai3; - UCL_Vector _igrid; + UCL_D_Vec _thetai1, _thetai2, _thetai3, _cgrid_brick; + UCL_D_Vec _igrid; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; + int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, 
_nxhi_out; + int _ngridx, _ngridy, _ngridz, _num_grid_points; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -272,7 +280,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind; UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -305,7 +313,6 @@ class BaseAmoeba { virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int fphi_uind() = 0; virtual int polar_real(const int eflag, const int vflag) = 0; - #if !defined(USE_OPENCL) && !defined(USE_HIP) From cad7e1b364c6b6e2a376b26b31af6386038580e3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 2 Sep 2022 10:18:59 -0500 Subject: [PATCH 103/181] Moved fphi_uind up to BaseAmoeba --- lib/gpu/lal_amoeba.cpp | 32 -------------------------------- lib/gpu/lal_amoeba.cu | 6 +++--- lib/gpu/lal_amoeba.h | 1 - lib/gpu/lal_base_amoeba.cpp | 36 ++++++++++++++++++++++++++++++++---- lib/gpu/lal_base_amoeba.h | 2 +- 5 files changed, 36 insertions(+), 41 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 38058bab55..924a175cfe 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -259,38 +259,6 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { return GX; } -// --------------------------------------------------------------------------- -// Interpolate the potential from the PME grid -// --------------------------------------------------------------------------- -template -int AmoebaT::fphi_uind() { - int ainum=this->ans->inum(); - if (ainum == 0) - return 0; - - int _nall=this->atom->nall(); - int nbor_pitch=this->nbor->nbor_pitch(); - - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - int ngridyz = this->_ngridy * this->_ngridz; - this->k_fphi_uind.set_size(GX,BX); - this->k_fphi_uind.run(&this->atom->x, &this->_thetai1, - &this->_thetai2, &this->_thetai3, - &this->_igrid, &this->_cgrid_brick, - &this->_fdip_phi1, &this->_fdip_phi2, - &this->_fdip_sum_phi, &this->_bsorder, - &ainum, &ngridyz, &this->_ngridy, - &this->_threads_per_atom); - this->time_pair.stop(); - - return GX; -} - // --------------------------------------------------------------------------- // Calculate the polar real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 984154f16e..200191cea2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1706,7 +1706,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tuv012 = (numtyp)0.0; tuv111 = (numtyp)0.0; - k = igrid[3*i+2] - nlpts; + k = igrid[4*i+2] - nlpts; for (int kb = 0; kb < bsorder; kb++) { /* v0 = thetai3[m][kb][0]; @@ -1742,7 +1742,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tu12 = (numtyp)0.0; tu03 = (numtyp)0.0; - j = igrid[3*i+1] - nlpts; + j = igrid[4*i+1] - nlpts; for (int jb = 0; jb < bsorder; jb++) { /* u0 = thetai2[m][jb][0]; @@ -1763,7 +1763,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 = (numtyp)0.0; t3 = (numtyp)0.0; - int ii = igrid[3*i] - nlpts; + int ii = igrid[4*i] - 
nlpts; for (int ib = 0; ib < bsorder; ib++) { /* tq_1 = grid[k][j][ii][0]; diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index 005ea14fb9..d12b79719f 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -91,7 +91,6 @@ class Amoeba : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); - int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index cd5a9abf81..1269a798db 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _max_thetai_size = 0; + _max_thetai_size = _max_tep_size; _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); @@ -466,7 +466,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); @@ -478,7 +478,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.resize(_max_thetai_size*bsorder*4); _thetai2.resize(_max_thetai_size*bsorder*4); _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size*3); + _igrid.resize(_max_thetai_size*4); _fdip_phi1.resize(_max_thetai_size*10); _fdip_phi2.resize(_max_thetai_size*10); @@ -498,7 +498,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; - dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device)); + dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device)); ucl_copy(_igrid,dview_int,false); _nzlo_out = nzlo_out; @@ -666,6 +666,34 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, const int red_blocks = fphi_uind(); } +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_uind() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + int _nall=atom->nall(); + int nbor_pitch=nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(ans->inum())/ + (BX/_threads_per_atom))); + + time_pair.start(); + int ngridyz = _ngridy * _ngridz; + k_fphi_uind.set_size(GX,BX); + k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3, + &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2, + &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy, + &_threads_per_atom); + time_pair.stop(); + + return GX; +} // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space diff --git a/lib/gpu/lal_base_amoeba.h 
b/lib/gpu/lal_base_amoeba.h index 8503e6fba4..d3ae3a750b 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -311,7 +311,7 @@ class BaseAmoeba { virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; - virtual int fphi_uind() = 0; + virtual int fphi_uind(); virtual int polar_real(const int eflag, const int vflag) = 0; From 21b7fb2fcfb842b1f332eb737ae83fa5f89d48d2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 2 Sep 2022 14:55:20 -0500 Subject: [PATCH 104/181] Exposing fphi_uind to the gpu pair style, still keeping the part not ready though --- lib/gpu/lal_amoeba_ext.cpp | 12 ++- lib/gpu/lal_base_amoeba.cpp | 200 ++++++++++++++++++++---------------- lib/gpu/lal_base_amoeba.h | 14 +-- src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 60 ++++++++++- src/GPU/pair_amoeba_gpu.h | 4 + 6 files changed, 193 insertions(+), 99 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 6989a5e6f6..151c38c9c4 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,9 +162,17 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, +void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, double ***grid) { + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration) { + AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, host_grid_brick_start, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 1269a798db..bdd43aa59e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _max_thetai_size = _max_tep_size; + _max_thetai_size = 0; _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); @@ -441,81 +441,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } -// --------------------------------------------------------------------------- -// Prepare for umutual1: bspline_fill -// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed -// - transfer extra data from host to device -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** host_igrid, - double* grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) { - - _bsorder = bsorder; - - // allocate or resize per-atom arrays - // _max_thetai_size, _max_tep_size and 
_max_fieldp_size are essentially _nmax - // will be consolidated once all terms are ready - - if (_max_thetai_size == 0) { - _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); - - _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); - - } else { - if (inum_full>_max_thetai_size) { - _max_thetai_size=static_cast(static_cast(inum_full)*1.10); - _thetai1.resize(_max_thetai_size*bsorder*4); - _thetai2.resize(_max_thetai_size*bsorder*4); - _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size*4); - - _fdip_phi1.resize(_max_thetai_size*10); - _fdip_phi2.resize(_max_thetai_size*10); - _fdip_sum_phi.resize(_max_thetai_size*20); - } - } - - UCL_H_Vec dview; - - // copy from host to device - - dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai1,dview,false); - dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai2,dview,false); - dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai3,dview,false); - - UCL_H_Vec dview_int; - dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device)); - ucl_copy(_igrid,dview_int,false); - - _nzlo_out = nzlo_out; - _nzhi_out = nzhi_out; - _nylo_out = nylo_out; - _nyhi_out = nyhi_out; - _nxlo_out = nxlo_out; - _nxhi_out = nxhi_out; - _ngridz = nzhi_out - nzlo_out + 1; - _ngridy = nyhi_out - nylo_out + 1; - _ngridx = nxhi_out - nxlo_out + 1; - _num_grid_points = _ngridx*_ngridy*_ngridz*2; - dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device)); - ucl_copy(_cgrid_brick,dview,false); - -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- @@ -626,6 +551,98 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // _fieldp.update_host(_max_fieldp_size*8,false); } +// --------------------------------------------------------------------------- +// Prepare for umutual1() after bspline_fill() is done on host +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed +// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 +// host_igrid is allocated with nmax by by 4 +// - transfer extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { + + _bsorder = bsorder; + + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + 
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + + } else { + if (inum_full>_max_thetai_size) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*bsorder*4); + _thetai2.resize(_max_thetai_size*bsorder*4); + _thetai3.resize(_max_thetai_size*bsorder*4); + _igrid.resize(_max_thetai_size*4); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); + } + } + + UCL_H_Vec dview; + dview.alloc(inum_full*bsorder*4,*(this->ucl_device)); + + // pack host data to device + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai1[i][j][0]; + dview[idx+1] = host_thetai1[i][j][1]; + dview[idx+2] = host_thetai1[i][j][2]; + dview[idx+3] = host_thetai1[i][j][3]; + } + ucl_copy(_thetai1,dview,false); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai2[i][j][0]; + dview[idx+1] = host_thetai2[i][j][1]; + dview[idx+2] = host_thetai2[i][j][2]; + dview[idx+3] = host_thetai2[i][j][3]; + } + ucl_copy(_thetai2,dview,false); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai3[i][j][0]; + dview[idx+1] = host_thetai3[i][j][1]; + dview[idx+2] = host_thetai3[i][j][2]; + dview[idx+3] = host_thetai3[i][j][3]; + } + ucl_copy(_thetai3,dview,false); + + UCL_H_Vec dview_int; + for (int i = 0; i < inum_full; i++) { + int idx = i*4; + dview_int[idx+0] = host_igrid[i][0]; + dview_int[idx+1] = host_igrid[i][1]; + dview_int[idx+2] = host_igrid[i][2]; + } + ucl_copy(_igrid,dview_int,false); +} + // --------------------------------------------------------------------------- // fphi_uind = induced potential from grid // fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid @@ -633,19 +650,22 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, - double *host_grid_brick_start, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void** host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) + int nxlo_out, int nxhi_out, bool& first_iteration) { - // allocation/resize and transfers (do this right after udirect?) 
+ // allocation/resize and transfers before the first iteration - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out, - nxlo_out, nxhi_out); - + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + if (first_iteration) first_iteration = false; + } + // update the cgrid_brick with data host _nzlo_out = nzlo_out; @@ -664,6 +684,14 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, ucl_copy(_cgrid_brick,dview,false); const int red_blocks = fphi_uind(); + + _fdip_phi1.update_host(_max_thetai_size*10); + _fdip_phi2.update_host(_max_thetai_size*10); + _fdip_sum_phi.update_host(_max_thetai_size*20); + + *host_fdip_phi1 = _fdip_phi1.host.begin(); + *host_fdip_phi2 = _fdip_phi2.host.begin(); + *host_fdip_sum_phi = _fdip_sum_phi.host.begin(); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index d3ae3a750b..a001423812 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -151,8 +151,8 @@ class BaseAmoeba { double *charge, double *boxlo, double *prd); virtual void precompute_induce(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, double* grid_brick_start, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out); @@ -180,12 +180,12 @@ class BaseAmoeba { const double aewald, const double off2_polar, void **fieldp_ptr); virtual void compute_fphi_uind(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, - double *host_grid_brick_start, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out); + int nxlo_out, int nxhi_out, bool& first_iteration); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 93978ab1f2..17b2d4a1e8 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -407,7 +407,7 @@ class PairAmoeba : public Pair { void grid_mpole(double **, double ***); void fphi_mpole(double ***, double **); void grid_uind(double **, double **, double ****); - void fphi_uind(double ****, double **, double **, double **); + virtual void fphi_uind(double ****, double **, double **, double **); void grid_disp(double ***); void kewald(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index cd3c01cde3..bf6db3472d 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,9 +88,13 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); -void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, - double** host_thetai1, double** host_thetai2, - double** host_thetai3, 
double ***grid); +void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -117,6 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; gpu_umutual1_ready = true; + gpu_fphi_uind_ready = false; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host @@ -481,6 +486,8 @@ void PairAmoebaGPU::induce() // conjugate gradient iteration of the mutual induced dipoles + first_induce_iteration = true; + while (!done) { iter++; @@ -1115,6 +1122,53 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) */ } +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3, + igrid, ic_kspace->grid_brick_start, + &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out, + first_induce_iteration); + + int nlocal = atom->nlocal; + double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 10 * i; + for (int m = 0; m < 10; m++) + fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + } + + double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 10 * i; + for (int m = 0; m < 10; m++) + fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; + } + + double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 20 * i; + for (int m = 0; m < 20; m++) + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e0563cd8b5..fe6ed3368f 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -39,6 +39,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual1(double **, double **); + virtual void fphi_uind(double ****, double **, double **, double **); virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); @@ -56,9 +57,12 @@ class PairAmoebaGPU : public PairAmoeba { bool gpu_multipole_real_ready; bool 
gpu_udirect2b_ready; bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; + bool first_induce_iteration; + void udirect2b_cpu(); template From a0af9627e5e9d2d3849ad74f1fe4d2ef7291123c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 6 Sep 2022 16:19:17 -0500 Subject: [PATCH 105/181] Fixed memory bugs with device array allocations --- lib/gpu/lal_amoeba.cu | 6 ++--- lib/gpu/lal_base_amoeba.cpp | 49 ++++++++++++++++++++----------------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 200191cea2..4a26f7f98d 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1714,7 +1714,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = thetai3[m][kb][3]; */ - int i3 = m*4*bsorder + 4*kb; + int i3 = i*4*bsorder + 4*kb; v0 = thetai3[i3]; v1 = thetai3[i3]+1; v2 = thetai3[i3+2]; @@ -1750,7 +1750,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - int i2 = m*4*bsorder+4*jb; + int i2 = i*4*bsorder+4*jb; u0 = thetai2[i2]; u1 = thetai2[i2+1]; u2 = thetai2[i2+2]; @@ -1776,7 +1776,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - int i1 = m*4*bsorder+4*ib; + int i1 = i*4*bsorder+4*ib; numtyp w0 = thetai1[i1]; numtyp w1 = thetai1[i1+1]; numtyp w2 = thetai1[i1+2]; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index bdd43aa59e..af8d5ca481 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -563,7 +563,7 @@ template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* grid_brick_start, int nzlo_out, + double* host_grid_brick_start, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out) { @@ -580,9 +580,9 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); - _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); } else { if (inum_full>_max_thetai_size) { @@ -634,13 +634,33 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; + dview_int.alloc(inum_full*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; dview_int[idx+0] = host_igrid[i][0]; dview_int[idx+1] = host_igrid[i][1]; dview_int[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid,dview_int,false); + ucl_copy(_igrid, dview_int, false); + + // update the cgrid_brick with data host + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx 
* _ngridy * _ngridz; + + UCL_H_Vec dview_cgrid; + dview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); + _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); + ucl_copy(_cgrid_brick,dview_cgrid,false); + } // --------------------------------------------------------------------------- @@ -666,23 +686,6 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, if (first_iteration) first_iteration = false; } - // update the cgrid_brick with data host - - _nzlo_out = nzlo_out; - _nzhi_out = nzhi_out; - _nylo_out = nylo_out; - _nyhi_out = nyhi_out; - _nxlo_out = nxlo_out; - _nxhi_out = nxhi_out; - _ngridz = nzhi_out - nzlo_out + 1; - _ngridy = nyhi_out - nylo_out + 1; - _ngridx = nxhi_out - nxlo_out + 1; - _num_grid_points = _ngridx*_ngridy*_ngridz*2; - - UCL_H_Vec dview; - dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device)); - ucl_copy(_cgrid_brick,dview,false); - const int red_blocks = fphi_uind(); _fdip_phi1.update_host(_max_thetai_size*10); @@ -929,7 +932,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); k_polar.set_function(*pair_program,kname_polar); - k_fphi_uind.set_function(*pair_program,"kname_fphi_uind"); + k_fphi_uind.set_function(*pair_program,"k_fphi_uind"); k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,kname_special15); pos_tex.get_texture(*pair_program,"pos_tex"); From 4b8caac727c793674abc7714d4f436a4b70d71f6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 9 Sep 2022 12:14:36 -0500 Subject: [PATCH 106/181] Made some progress with fphi_uind in the gpu pair style --- lib/gpu/lal_amoeba.cu | 61 ++++++++++++++++-------- lib/gpu/lal_amoeba_ext.cpp | 6 +-- lib/gpu/lal_base_amoeba.cpp | 93 ++++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.h | 9 ++-- src/GPU/pair_amoeba_gpu.cpp | 14 ++++-- 5 files changed, 114 insertions(+), 69 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 4a26f7f98d..b0013f0b9b 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -//#include +#include #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -1630,14 +1630,19 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, __global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_sum_phi, const int bsorder, const int inum, - const int nyzgrid, const int nygrid, - const int t_per_atom) + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + const int ngridxy, const int ngridx) { - int tid, ii, offset, i, n_stride; - atom_info(t_per_atom,ii,tid,offset); + //int tid, ii, offset, i, n_stride; + //atom_info(t_per_atom,ii,tid,offset); + + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (ii void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* host_grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, + double* host_grid_brick_start, double**** host_grid_brick, + int nzlo_out, int nzhi_out, + int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out) { _bsorder = bsorder; @@ -599,7 
+600,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, } UCL_H_Vec dview; - dview.alloc(inum_full*bsorder*4,*(this->ucl_device)); + dview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); // pack host data to device @@ -634,7 +635,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; - dview_int.alloc(inum_full*4, *(this->ucl_device)); + dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; dview_int[idx+0] = host_igrid[i][0]; @@ -643,6 +644,33 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, } ucl_copy(_igrid, dview_int, false); + + +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, double ****host_grid_brick, + void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration) +{ + // allocation/resize and transfers before the first iteration + + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + if (first_iteration) first_iteration = false; + } + // update the cgrid_brick with data host _nzlo_out = nzlo_out; @@ -656,36 +684,27 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - UCL_H_Vec dview_cgrid; - dview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); + UCL_H_Vec hview_cgrid; + hview_cgrid.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_WRITE); + int n = 0; + for (int iz = nzlo_out; iz <= nzhi_out; iz++) + for (int iy = nylo_out; iy <= nyhi_out; iy++) + for (int ix = nxlo_out; ix <= nxhi_out; ix++) { +/* + if (iz == nzlo_out && iy == nylo_out && ix == nxlo_out) { + printf("origin = %d %d %d: grid = %f %f %f\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1]); + } + if (iz == -2 && iy == 4 && ix == 8) printf("ixyz = %d %d %d: grid = %f %f %f; n = %d\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1], n); +*/ + hview_cgrid[n] = host_grid_brick[iz][iy][ix][0]; + hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; + n += 2; + } + //hview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); - ucl_copy(_cgrid_brick,dview_cgrid,false); + ucl_copy(_cgrid_brick,hview_cgrid,false); -} -// --------------------------------------------------------------------------- -// fphi_uind = induced potential from grid -// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double 
***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void** host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) -{ - // allocation/resize and transfers before the first iteration - - if (first_iteration) { - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - igrid, host_grid_brick_start, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); - if (first_iteration) first_iteration = false; - } - const int red_blocks = fphi_uind(); _fdip_phi1.update_host(_max_thetai_size*10); @@ -711,16 +730,16 @@ int BaseAmoebaT::fphi_uind() { // Compute the block size and grid size to keep all cores busy const int BX=block_size(); - int GX=static_cast(ceil(static_cast(ans->inum())/ - (BX/_threads_per_atom))); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); time_pair.start(); - int ngridyz = _ngridy * _ngridz; + int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2, - &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy, - &_threads_per_atom); + &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nzhi_out, &_nylo_out, &_nyhi_out, + &_nxlo_out, &_nxhi_out, &ngridxy, &_ngridx); time_pair.stop(); return GX; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a001423812..c2c2a2d93d 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -153,8 +153,9 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double* grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, + double *host_grid_brick_start, double ****host_grid_brick, + int nzlo_out, int nzhi_out, + int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out); /// Compute multipole real-space with device neighboring @@ -182,8 +183,8 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, + double *host_grid_brick_start, double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out, bool& first_iteration); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index bf6db3472d..936cf8afbc 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -91,7 +91,7 @@ void amoeba_gpu_update_fieldp(void **fieldp_ptr); void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void **host_fdip_phi1, + double *host_grid_brick_start, double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out, bool& first_iteration); @@ -121,7 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; 
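
A side note on the launch change in lal_base_amoeba.cpp above: with the threads-per-atom subdivision removed, each work-item handles exactly one atom and the grid size reduces to a single ceiling division. A minimal standalone sketch of that host-side sizing (the inum and BX values are made up for illustration; this is not LAMMPS code):

#include <cmath>
#include <cstdio>

int main() {
  const int inum = 1000;   // number of local atoms (made-up value)
  const int BX   = 128;    // kernel block size (made-up value)

  // one thread per atom: the grid size is a plain ceiling division,
  // with no (BX/_threads_per_atom) subdivision any more
  const int GX = static_cast<int>(std::ceil(static_cast<double>(inum) / BX));

  // the kernel-side counterpart is the bounds check on
  //   ii = THREAD_ID_X + BLOCK_ID_X*BLOCK_SIZE_X
  std::printf("blocks = %d, threads/block = %d, total threads = %d\n",
              GX, BX, GX * BX);
  return 0;
}

Inside k_fphi_uind the matching guard is the if (ii < inum) check shown in the kernel above.
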
gpu_umutual1_ready = true; - gpu_fphi_uind_ready = false; + gpu_fphi_uind_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host @@ -1139,7 +1139,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3, - igrid, ic_kspace->grid_brick_start, + igrid, ic_kspace->grid_brick_start, grid, &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, @@ -1150,8 +1150,10 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int idx = 10 * i; - for (int m = 0; m < 10; m++) - fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + } + if (i == 0) printf("gpu fdip phi1 = %f %f %f\n", fdip_phi1[i][0], fdip_phi1[i][1], fdip_phi1[i][2]); } double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; @@ -1159,6 +1161,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 10 * i; for (int m = 0; m < 10; m++) fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; + if (i == 0) printf("gpu fdip phi2 = %f %f %f\n", fdip_phi2[i][0], fdip_phi2[i][1], fdip_phi2[i][2]); } double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; @@ -1166,6 +1169,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 20 * i; for (int m = 0; m < 20; m++) fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; + if (i == 0) printf("gpu fdip sum phi = %f %f %f\n", fdip_sum_phi[i][0], fdip_sum_phi[i][1], fdip_sum_phi[i][2]); } } From b72b71837ebc6de746c694c3c132f5fde5c36c80 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 9 Sep 2022 13:34:57 -0500 Subject: [PATCH 107/181] Moved first_induce_iteration in induce() to the right place --- lib/gpu/lal_base_amoeba.cpp | 6 +++--- src/GPU/pair_amoeba_gpu.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 21a97a0852..ceb9b97cbc 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -656,7 +656,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, + double ***host_thetai3, int** host_igrid, double *host_grid_brick_start, double ****host_grid_brick, void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, @@ -666,9 +666,9 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, if (first_iteration) { precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, + host_igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out); - if (first_iteration) first_iteration = false; + first_iteration = false; } // update the cgrid_brick with data host diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 936cf8afbc..8d799a82eb 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -290,6 +290,8 @@ void PairAmoebaGPU::induce() 
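
The hunk below moves the per-step reset of first_induce_iteration from just before the conjugate-gradient loop to the top of induce(). The pattern it supports, raise the flag once per solve and let the first compute call perform the allocations and transfers before lowering it, looks roughly like this standalone sketch (the Solver type and its names are illustrative only, not LAMMPS code):

#include <cstdio>

struct Solver {
  bool first_iteration = true;

  // stand-in for the one-time device allocation and host-to-device transfers
  void setup_once() { std::printf("allocate buffers, copy static data\n"); }

  void compute(int iter) {
    if (first_iteration) {       // mirrors the check in compute_fphi_uind()
      setup_once();
      first_iteration = false;
    }
    std::printf("iteration %d\n", iter);
  }

  void solve(int niter) {
    first_iteration = true;      // raised once per solve, before the loop
    for (int i = 0; i < niter; ++i) compute(i);
  }
};

int main() { Solver s; s.solve(3); return 0; }
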
int debug = 1; + first_induce_iteration = true; + // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -486,8 +488,6 @@ void PairAmoebaGPU::induce() // conjugate gradient iteration of the mutual induced dipoles - first_induce_iteration = true; - while (!done) { iter++; From c58343b2e29c0c514563169018f322cccf73715d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 9 Sep 2022 13:50:41 -0500 Subject: [PATCH 108/181] Cleaned up debugging stuffs, need more refactoring and add to hippo --- lib/gpu/lal_amoeba.cu | 7 +------ lib/gpu/lal_base_amoeba.cpp | 11 +---------- src/GPU/pair_amoeba_gpu.cpp | 3 --- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index b0013f0b9b..591a896bc8 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -#include +//#include #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -1789,11 +1789,6 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, int gidx = 2*(k*ngridxy + j*ngridx + i); tq_1 = grid[gidx]; tq_2 = grid[gidx+1]; -/* - if (ii == 0 && jb == 0 && kb == 0) - printf("ii = 0: igrid %d %d %d; grid %f %f; k = %d j = %d; i = %d; origin = %f %f; gidx = %d\n", - igrid[4*ii+0], igrid[4*ii+1], igrid[4*ii+2], tq_1, tq_2, k, j, i, grid[0], grid[1], gidx); -*/ t0_1 += tq_1*w0; t1_1 += tq_1*w1; t2_1 += tq_1*w2; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index ceb9b97cbc..05b830d773 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -642,10 +642,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, dview_int[idx+1] = host_igrid[i][1]; dview_int[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid, dview_int, false); - - - + ucl_copy(_igrid, dview_int, false); } // --------------------------------------------------------------------------- @@ -690,12 +687,6 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, for (int iz = nzlo_out; iz <= nzhi_out; iz++) for (int iy = nylo_out; iy <= nyhi_out; iy++) for (int ix = nxlo_out; ix <= nxhi_out; ix++) { -/* - if (iz == nzlo_out && iy == nylo_out && ix == nxlo_out) { - printf("origin = %d %d %d: grid = %f %f %f\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1]); - } - if (iz == -2 && iy == 4 && ix == 8) printf("ixyz = %d %d %d: grid = %f %f %f; n = %d\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1], n); -*/ hview_cgrid[n] = host_grid_brick[iz][iy][ix][0]; hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 8d799a82eb..8618317704 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1153,7 +1153,6 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, for (int m = 0; m < 10; m++) { fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; } - if (i == 0) printf("gpu fdip phi1 = %f %f %f\n", fdip_phi1[i][0], fdip_phi1[i][1], fdip_phi1[i][2]); } double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; @@ -1161,7 +1160,6 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 10 * i; for (int m = 0; m < 10; m++) fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; - if (i == 0) printf("gpu fdip phi2 = %f %f %f\n", fdip_phi2[i][0], fdip_phi2[i][1], fdip_phi2[i][2]); } double 
*_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; @@ -1169,7 +1167,6 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 20 * i; for (int m = 0; m < 20; m++) fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; - if (i == 0) printf("gpu fdip sum phi = %f %f %f\n", fdip_sum_phi[i][0], fdip_sum_phi[i][1], fdip_sum_phi[i][2]); } } From 363b6c51d0355ce9e6e470ac8462263c811de33d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 10 Sep 2022 02:31:39 -0500 Subject: [PATCH 109/181] Used local arrays and re-arranged for coalesced global memory writes --- lib/gpu/lal_amoeba.cu | 125 +++++++++++++++++------------------- src/GPU/pair_amoeba_gpu.cpp | 21 +++--- 2 files changed, 72 insertions(+), 74 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 591a896bc8..fb66158d06 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1637,12 +1637,14 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, { //int tid, ii, offset, i, n_stride; //atom_info(t_per_atom,ii,tid,offset); + int tid=THREAD_ID_X; int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (iinlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 10 * i; + int n = i; for (int m = 0; m < 10; m++) { - fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; } } double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 10 * i; - for (int m = 0; m < 10; m++) - fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } } double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 20 * i; - for (int m = 0; m < 20; m++) - fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } } } From 5e59c95be403b26e59e8b914e2b43fe31441dd9f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 10 Sep 2022 02:45:06 -0500 Subject: [PATCH 110/181] Moved temp variables inside loops --- lib/gpu/lal_amoeba.cu | 187 ++++++++++++++++++------------------------ 1 file changed, 82 insertions(+), 105 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fb66158d06..105f18cfa8 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1643,75 +1643,52 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (ii Date: Sun, 11 Sep 2022 18:58:34 -0500 Subject: [PATCH 111/181] Re-arranged memory allocation for cgrid_brick, some issues need to be fixed --- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_amoeba_ext.cpp | 15 ++++--- lib/gpu/lal_base_amoeba.cpp | 87 +++++++++++++++++++++---------------- lib/gpu/lal_base_amoeba.h | 19 ++++---- 4 files changed, 69 insertions(+), 54 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 105f18cfa8..d67fa4f869 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1646,7 +1646,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, //numtyp4 ix; fetch4(ix,ii,pos_tex); //x_[i]; acctyp fdip_buf[32]; - int j,k,m; + int j,k; int nlpts = (bsorder-1) / 2; // extract the permanent multipole field at each site diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 95b7237e46..f91b76f688 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -164,15 +164,16 @@ void 
amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, + double ***host_thetai3, int** igrid, double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) { + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration) { AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick_start, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); + host_thetai3, igrid, host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 05b830d773..dfd5565f1e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -186,8 +186,10 @@ void BaseAmoebaT::clear_atomic() { _igrid.clear(); _fdip_phi1.clear(); _fdip_phi2.clear(); - _cgrid_brick.clear(); _fdip_sum_phi.clear(); + _cgrid_brick.clear(); + hview_cgrid.clear(); + dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -563,10 +565,9 @@ template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* host_grid_brick_start, double**** host_grid_brick, - int nzlo_out, int nzhi_out, - int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) { + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { _bsorder = bsorder; @@ -642,34 +643,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, dview_int[idx+1] = host_igrid[i][1]; dview_int[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid, dview_int, false); -} + ucl_copy(_igrid, dview_int, false); -// --------------------------------------------------------------------------- -// fphi_uind = induced potential from grid -// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** host_igrid, - double *host_grid_brick_start, double ****host_grid_brick, - void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) -{ - // allocation/resize and transfers before the first iteration - - if (first_iteration) { - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - host_igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); - first_iteration = false; - } - - // update the cgrid_brick with data host - _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; _nylo_out 
= nylo_out; @@ -681,8 +656,47 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - UCL_H_Vec hview_cgrid; - hview_cgrid.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_WRITE); + int numel = _num_grid_points*2; + if (_cgrid_brick.cols() == 0) { + hview_cgrid.alloc(numel, *(this->ucl_device), UCL_READ_WRITE); + _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_ONLY); + } else if (numel > _cgrid_brick.cols()) { + hview_cgrid.resize(numel); + _cgrid_brick.resize(numel); + } +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + double ****host_grid_brick, + void** host_fdip_phi1, + void **host_fdip_phi2, + void **host_fdip_sum_phi, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration) +{ + // TODO: find out why this alloc helps makes the cgrid_brick ucl_copy work + UCL_H_Vec hview; + hview.alloc(1, *(this->ucl_device), UCL_READ_ONLY); + + // allocation/resize and transfers before the first iteration + + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, host_igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + first_iteration = false; + } + int n = 0; for (int iz = nzlo_out; iz <= nzhi_out; iz++) for (int iy = nylo_out; iy <= nyhi_out; iy++) @@ -691,10 +705,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; } - //hview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); - _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); - ucl_copy(_cgrid_brick,hview_cgrid,false); - + ucl_copy(_cgrid_brick, hview_cgrid, false); const int red_blocks = fphi_uind(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index c2c2a2d93d..a4a7a8d1a7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -153,10 +153,9 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, - int nzlo_out, int nzhi_out, - int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out); + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -183,10 +182,12 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, + double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int 
nxhi_out, bool& first_iteration); + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -255,7 +256,9 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_D_Vec _thetai1, _thetai2, _thetai3, _cgrid_brick; + UCL_D_Vec _thetai1, _thetai2, _thetai3; + UCL_H_Vec hview_cgrid; + UCL_D_Vec _cgrid_brick; UCL_D_Vec _igrid; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; From 17e54c939019466f8b78c4cfb0372499925bb3c3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 11 Sep 2022 19:00:40 -0500 Subject: [PATCH 112/181] Updated the GPU API in the gpu pair style --- src/GPU/pair_amoeba_gpu.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 4c77417ff0..3790ca4231 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -91,10 +91,12 @@ void amoeba_gpu_update_fieldp(void **fieldp_ptr); void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, void **host_fdip_phi1, + double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration); + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -1138,13 +1140,14 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; - amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3, - igrid, ic_kspace->grid_brick_start, grid, - &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out, - first_induce_iteration); + amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, + thetai2, thetai3, igrid, grid, + &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out, + first_induce_iteration); int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; From 31047b4a316413b15d856a3a32256aefa77195e9 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 13 Sep 2022 12:53:48 -0500 Subject: [PATCH 113/181] Removed mem alloc in precompute_induce, used buffer for packing, and switched to using ucl_vector --- lib/gpu/lal_base_amoeba.cpp | 79 +++++++++++++++++++------------------ lib/gpu/lal_base_amoeba.h | 8 ++-- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index dfd5565f1e..5989ba889d 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -188,7 +188,8 @@ void BaseAmoebaT::clear_atomic() { _fdip_phi2.clear(); 
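
For orientation, the brick-shaped convolution grid that the hunks below copy to the device is flattened with z as the outermost index and the two values stored per grid point interleaved. A standalone sketch of that packing, and of recovering a cell offset from (iz,iy,ix); all bounds and values here are made up for illustration and this is not LAMMPS code:

#include <cstdio>
#include <vector>

int main() {
  // made-up ghost-extended bounds standing in for n{x,y,z}{lo,hi}_out
  const int nzlo = -2, nzhi = 5, nylo = -2, nyhi = 5, nxlo = -2, nxhi = 5;
  const int ngz = nzhi - nzlo + 1;
  const int ngy = nyhi - nylo + 1;
  const int ngx = nxhi - nxlo + 1;

  std::vector<double> flat(2 * ngz * ngy * ngx, 0.0);

  // pack z-outer / x-inner, two values per grid point, in the same order
  // the code below copies host_grid_brick[iz][iy][ix][0..1]
  int n = 0;
  for (int iz = nzlo; iz <= nzhi; iz++)
    for (int iy = nylo; iy <= nyhi; iy++)
      for (int ix = nxlo; ix <= nxhi; ix++) {
        flat[n]     = 1.0;   // stand-in for host_grid_brick[iz][iy][ix][0]
        flat[n + 1] = 0.0;   // stand-in for host_grid_brick[iz][iy][ix][1]
        n += 2;
      }

  // recovering the offset of a cell: shift by the lower ghost bounds first,
  // roughly how the kernel turns igrid values into brick-relative indices
  auto offset = [&](int iz, int iy, int ix) {
    return 2 * (((iz - nzlo) * ngy + (iy - nylo)) * ngx + (ix - nxlo));
  };

  std::printf("flat size = %zu, cell (0,0,0) at offset %d\n",
              flat.size(), offset(0, 0, 0));
  return 0;
}
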
_fdip_sum_phi.clear(); _cgrid_brick.clear(); - hview_cgrid.clear(); + + hview.clear(); dev_nspecial15.clear(); dev_special15.clear(); @@ -586,6 +587,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); + hview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); + } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); @@ -597,53 +600,53 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _fdip_phi1.resize(_max_thetai_size*10); _fdip_phi2.resize(_max_thetai_size*10); _fdip_sum_phi.resize(_max_thetai_size*20); + + hview.resize(_max_thetai_size*bsorder*4); } } - UCL_H_Vec dview; - dview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); - // pack host data to device for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { int idx = i*4*bsorder + 4*j; - dview[idx+0] = host_thetai1[i][j][0]; - dview[idx+1] = host_thetai1[i][j][1]; - dview[idx+2] = host_thetai1[i][j][2]; - dview[idx+3] = host_thetai1[i][j][3]; + hview[idx+0] = host_thetai1[i][j][0]; + hview[idx+1] = host_thetai1[i][j][1]; + hview[idx+2] = host_thetai1[i][j][2]; + hview[idx+3] = host_thetai1[i][j][3]; } - ucl_copy(_thetai1,dview,false); + ucl_copy(_thetai1,hview,false); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { int idx = i*4*bsorder + 4*j; - dview[idx+0] = host_thetai2[i][j][0]; - dview[idx+1] = host_thetai2[i][j][1]; - dview[idx+2] = host_thetai2[i][j][2]; - dview[idx+3] = host_thetai2[i][j][3]; + hview[idx+0] = host_thetai2[i][j][0]; + hview[idx+1] = host_thetai2[i][j][1]; + hview[idx+2] = host_thetai2[i][j][2]; + hview[idx+3] = host_thetai2[i][j][3]; } - ucl_copy(_thetai2,dview,false); + ucl_copy(_thetai2,hview,false); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { int idx = i*4*bsorder + 4*j; - dview[idx+0] = host_thetai3[i][j][0]; - dview[idx+1] = host_thetai3[i][j][1]; - dview[idx+2] = host_thetai3[i][j][2]; - dview[idx+3] = host_thetai3[i][j][3]; + hview[idx+0] = host_thetai3[i][j][0]; + hview[idx+1] = host_thetai3[i][j][1]; + hview[idx+2] = host_thetai3[i][j][2]; + hview[idx+3] = host_thetai3[i][j][3]; } - ucl_copy(_thetai3,dview,false); + ucl_copy(_thetai3,hview,false); - UCL_H_Vec dview_int; - dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); + //UCL_H_Vec dview_int; + //dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; - dview_int[idx+0] = host_igrid[i][0]; - dview_int[idx+1] = host_igrid[i][1]; - dview_int[idx+2] = host_igrid[i][2]; + _igrid[idx+0] = host_igrid[i][0]; + _igrid[idx+1] = host_igrid[i][1]; + _igrid[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid, dview_int, false); + //ucl_copy(_igrid, dview_int, false); + _igrid.update_device(false); _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; @@ -658,10 +661,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, int numel = _num_grid_points*2; if (_cgrid_brick.cols() == 0) { - hview_cgrid.alloc(numel, *(this->ucl_device), UCL_READ_WRITE); - _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_ONLY); + _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); } else if (numel > _cgrid_brick.cols()) { - hview_cgrid.resize(numel); _cgrid_brick.resize(numel); } } @@ -684,10 +685,6 @@ void BaseAmoebaT::compute_fphi_uind(const int 
inum_full, const int bsorder, const int nxlo_out, const int nxhi_out, bool& first_iteration) { - // TODO: find out why this alloc helps makes the cgrid_brick ucl_copy work - UCL_H_Vec hview; - hview.alloc(1, *(this->ucl_device), UCL_READ_ONLY); - // allocation/resize and transfers before the first iteration if (first_iteration) { @@ -697,15 +694,19 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, first_iteration = false; } + // TODO: find out why this host alloc helps makes the cgrid_brick update_device() work correcly + UCL_H_Vec hdummy; + hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); + int n = 0; - for (int iz = nzlo_out; iz <= nzhi_out; iz++) - for (int iy = nylo_out; iy <= nyhi_out; iy++) - for (int ix = nxlo_out; ix <= nxhi_out; ix++) { - hview_cgrid[n] = host_grid_brick[iz][iy][ix][0]; - hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + _cgrid_brick[n] = host_grid_brick[iz][iy][ix][0]; + _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; } - ucl_copy(_cgrid_brick, hview_cgrid, false); + _cgrid_brick.update_device(false); const int red_blocks = fphi_uind(); @@ -762,7 +763,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, int** firstneigh = nullptr; cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - atom->add_extra_data(); + atom->add_extra_data(); *tep_ptr=_tep.host.begin(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a4a7a8d1a7..760d0e3005 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -257,14 +257,16 @@ class BaseAmoeba { int _bsorder; UCL_D_Vec _thetai1, _thetai2, _thetai3; - UCL_H_Vec hview_cgrid; - UCL_D_Vec _cgrid_brick; - UCL_D_Vec _igrid; + UCL_Vector _igrid; + UCL_Vector _cgrid_brick; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; + /// buffer + UCL_H_Vec hview; + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; From 9c4d3db5584635066410d13ce89d9c3edd4bdb3d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 13 Sep 2022 16:48:39 -0500 Subject: [PATCH 114/181] Cleaned up and converted arrays to ucl_vector of numtyp4 --- lib/gpu/lal_amoeba.cu | 39 +++++++++++---------- lib/gpu/lal_base_amoeba.cpp | 68 +++++++++++++++++-------------------- lib/gpu/lal_base_amoeba.h | 5 +-- 3 files changed, 54 insertions(+), 58 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index d67fa4f869..53a9f6aa3e 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1621,9 +1621,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, - const __global numtyp *restrict thetai1, - const __global numtyp *restrict thetai2, - const __global numtyp *restrict thetai3, + const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp *restrict grid, __global numtyp *restrict fdip_phi1, @@ -1698,11 +1698,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = 
thetai3[m][kb][3]; */ - int i3 = ii*4*bsorder + 4*kb; - numtyp v0 = thetai3[i3]; - numtyp v1 = thetai3[i3+1]; - numtyp v2 = thetai3[i3+2]; - numtyp v3 = thetai3[i3+3]; + int i3 = ii*bsorder + kb; + numtyp4 tha3 = thetai3[i3]; + numtyp v0 = tha3.x; + numtyp v1 = tha3.y; + numtyp v2 = tha3.z; + numtyp v3 = tha3.w; numtyp tu00_1 = (numtyp)0.0; numtyp tu01_1 = (numtyp)0.0; numtyp tu10_1 = (numtyp)0.0; @@ -1734,11 +1735,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - int i2 = ii*4*bsorder+4*jb; - numtyp u0 = thetai2[i2]; - numtyp u1 = thetai2[i2+1]; - numtyp u2 = thetai2[i2+2]; - numtyp u3 = thetai2[i2+3]; + int i2 = ii*bsorder+jb; + numtyp4 tha2 = thetai2[i2]; + numtyp u0 = tha2.x; + numtyp u1 = tha2.y; + numtyp u2 = tha2.z; + numtyp u3 = tha2.w; numtyp t0_1 = (numtyp)0.0; numtyp t1_1 = (numtyp)0.0; numtyp t2_1 = (numtyp)0.0; @@ -1760,11 +1762,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - int i1 = ii*4*bsorder+4*ib; - numtyp w0 = thetai1[i1]; - numtyp w1 = thetai1[i1+1]; - numtyp w2 = thetai1[i1+2]; - numtyp w3 = thetai1[i1+3]; + int i1 = ii*bsorder+ib; + numtyp4 tha1 = thetai1[i1]; + numtyp w0 = tha1.x; + numtyp w1 = tha1.y; + numtyp w2 = tha1.z; + numtyp w3 = tha1.w; int gidx = 2*(k*ngridxy + j*ngridx + i); numtyp tq_1 = grid[gidx]; numtyp tq_2 = grid[gidx+1]; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5989ba889d..3e14159d5a 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -189,8 +189,6 @@ void BaseAmoebaT::clear_atomic() { _fdip_sum_phi.clear(); _cgrid_brick.clear(); - hview.clear(); - dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -578,30 +576,25 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); - - hview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); - } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); - _thetai1.resize(_max_thetai_size*bsorder*4); - _thetai2.resize(_max_thetai_size*bsorder*4); - _thetai3.resize(_max_thetai_size*bsorder*4); + _thetai1.resize(_max_thetai_size*bsorder); + _thetai2.resize(_max_thetai_size*bsorder); + _thetai3.resize(_max_thetai_size*bsorder); _igrid.resize(_max_thetai_size*4); _fdip_phi1.resize(_max_thetai_size*10); _fdip_phi2.resize(_max_thetai_size*10); _fdip_sum_phi.resize(_max_thetai_size*20); - - hview.resize(_max_thetai_size*bsorder*4); } } @@ -609,44 +602,47 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, for (int i = 0; i < inum_full; 
i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai1[i][j][0]; - hview[idx+1] = host_thetai1[i][j][1]; - hview[idx+2] = host_thetai1[i][j][2]; - hview[idx+3] = host_thetai1[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai1[i][j][0]; + v.y = host_thetai1[i][j][1]; + v.z = host_thetai1[i][j][2]; + v.w = host_thetai1[i][j][3]; + _thetai1[idx] = v; } - ucl_copy(_thetai1,hview,false); + _thetai1.update_device(true); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai2[i][j][0]; - hview[idx+1] = host_thetai2[i][j][1]; - hview[idx+2] = host_thetai2[i][j][2]; - hview[idx+3] = host_thetai2[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai2[i][j][0]; + v.y = host_thetai2[i][j][1]; + v.z = host_thetai2[i][j][2]; + v.w = host_thetai2[i][j][3]; + _thetai2[idx] = v; } - ucl_copy(_thetai2,hview,false); + _thetai2.update_device(true); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai3[i][j][0]; - hview[idx+1] = host_thetai3[i][j][1]; - hview[idx+2] = host_thetai3[i][j][2]; - hview[idx+3] = host_thetai3[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai3[i][j][0]; + v.y = host_thetai3[i][j][1]; + v.z = host_thetai3[i][j][2]; + v.w = host_thetai3[i][j][3]; + _thetai3[idx] = v; } - ucl_copy(_thetai3,hview,false); + _thetai3.update_device(true); - //UCL_H_Vec dview_int; - //dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; _igrid[idx+0] = host_igrid[i][0]; _igrid[idx+1] = host_igrid[i][1]; _igrid[idx+2] = host_igrid[i][2]; } - //ucl_copy(_igrid, dview_int, false); - _igrid.update_device(false); + _igrid.update_device(true); _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; @@ -694,7 +690,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, first_iteration = false; } - // TODO: find out why this host alloc helps makes the cgrid_brick update_device() work correcly + // TODO: find out why this host alloc helps the cgrid_brick update_device() work correcly UCL_H_Vec hdummy; hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 760d0e3005..802b6962b7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -256,7 +256,7 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_D_Vec _thetai1, _thetai2, _thetai3; + UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; UCL_Vector _cgrid_brick; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; @@ -264,9 +264,6 @@ class BaseAmoeba { int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; - /// buffer - UCL_H_Vec hview; - // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; From cd3a00c2c44086c7c2531e5f61c2985789e5658c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 14 Sep 2022 15:28:44 -0500 Subject: [PATCH 115/181] Added timing breakdown for fphi_uind --- lib/gpu/lal_hippo.cpp | 27 --------------------------- lib/gpu/lal_hippo.h | 1 - src/AMOEBA/pair_amoeba.cpp | 7 ++++++- src/AMOEBA/pair_amoeba.h | 1 + src/GPU/pair_amoeba_gpu.cpp | 7 +++++++ 5 files changed, 14 insertions(+), 29 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d980ae0ed6..79a8772c3e 100644 --- 
a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -592,33 +592,6 @@ int HippoT::umutual2b(const int eflag, const int vflag) { return GX; } -// --------------------------------------------------------------------------- -// Interpolate the potential from the PME grid -// --------------------------------------------------------------------------- -template -int HippoT::fphi_uind() { - int ainum=this->ans->inum(); - if (ainum == 0) - return 0; - - int _nall=this->atom->nall(); - int nbor_pitch=this->nbor->nbor_pitch(); - - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); -/* - this->time_pair.start(); - - this->k_fphi_uind.set_size(GX,BX); - this->k_fphi_uind.run(); - this->time_pair.stop(); -*/ - - return GX; -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index cece72caac..492712eb85 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -157,7 +157,6 @@ class Hippo : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); - int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 3b66ebc221..9890904e42 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -348,6 +348,7 @@ void PairAmoeba::compute(int eflag, int vflag) time_mutual_rspace = time_mutual_kspace = 0.0; time_polar_rspace = time_polar_kspace = 0.0; + time_fphi_uind = 0.0; if (ic_kspace) { ic_kspace->time_fft = 0.0; } @@ -546,6 +547,9 @@ void PairAmoeba::finish() MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_polar_kspace = ave/comm->nprocs; + MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_fphi_uind = ave/comm->nprocs; + double time_mutual_fft = ic_kspace->time_fft; MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mutual_fft = ave/comm->nprocs; @@ -578,7 +582,8 @@ void PairAmoeba::finish() utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); - utils::logmesg(lmp," - FFT time: {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); + utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); + utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 17b2d4a1e8..a95065d851 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -92,6 +92,7 @@ class PairAmoeba : public Pair { double time_direct_rspace,time_direct_kspace; double time_mutual_rspace,time_mutual_kspace; double time_polar_rspace,time_polar_kspace; + double time_fphi_uind; // energy/virial components diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 
3790ca4231..b85db8ea47 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1058,9 +1058,16 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential + double time0, time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + time1 = MPI_Wtime(); + time_fphi_uind += (time1 - time0); + // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { From 0359d405802f295b933a93da9515a73eb9c17897 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 14 Sep 2022 16:11:43 -0500 Subject: [PATCH 116/181] Added interpolation timing for the cpu version --- src/AMOEBA/amoeba_induce.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 01491a8708..90a52ca402 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -940,9 +940,16 @@ void PairAmoeba::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential + double time0, time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + time1 = MPI_Wtime(); + time_fphi_uind += (time1 - time0); + // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { From 880f20c2858c0fb4c855c0fcf84a3aaaa86af533 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 15 Sep 2022 15:29:14 -0500 Subject: [PATCH 117/181] Cleaned up kernels --- lib/gpu/lal_amoeba.cpp | 5 +- lib/gpu/lal_amoeba.cu | 47 +- lib/gpu/lal_base_amoeba.cpp | 8 +- src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 951 ++++++++++++++++++++++++++++++++++++ src/GPU/pair_amoeba_gpu.h | 1 + 6 files changed, 989 insertions(+), 25 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 924a175cfe..48316e9b6e 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -117,7 +117,10 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, _allocated=true; this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() - + sp_amoeba.row_bytes() + this->_tep.row_bytes(); + + sp_amoeba.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); return 0; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 53a9f6aa3e..d391279f5d 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -849,7 +849,9 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, if (damp != (numtyp)0.0) { numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype] if (pgamma != (numtyp)0.0) { - damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + //damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * ucl_sqrt(tmp*tmp*tmp); if (damp < (numtyp)50.0) { numtyp expdamp = ucl_exp(-damp) ; scale3 = (numtyp)1.0 - expdamp ; @@ -858,7 +860,9 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } } else { pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + //damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * (tmp*tmp*tmp); if (damp < (numtyp)50.0) { numtyp expdamp = ucl_exp(-damp); scale3 = (numtyp)1.0 - expdamp; @@ 
-1314,7 +1318,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] if (damp != (numtyp)0.0) { numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + //damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * (tmp*tmp*tmp); if (damp < (numtyp)50.0) { numtyp expdamp = ucl_exp(-damp); sc3 = (numtyp)1.0 - expdamp; @@ -1620,8 +1626,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -__kernel void k_fphi_uind(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict thetai1, +__kernel void k_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, @@ -1630,10 +1635,9 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, __global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_sum_phi, const int bsorder, const int inum, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - const int ngridxy, const int ngridx) + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) { //int tid, ii, offset, i, n_stride; //atom_info(t_per_atom,ii,tid,offset); @@ -1643,11 +1647,16 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (iix, &_thetai1, &_thetai2, &_thetai3, - &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2, - &_fdip_sum_phi, &_bsorder, &ainum, - &_nzlo_out, &_nzhi_out, &_nylo_out, &_nyhi_out, - &_nxlo_out, &_nxhi_out, &ngridxy, &_ngridx); + k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); time_pair.stop(); return GX; diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index a95065d851..24ce6fcfbc 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -374,7 +374,7 @@ class PairAmoeba : public Pair { void polar(); void polar_energy(); virtual void polar_real(); - void polar_kspace(); + virtual void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); virtual void induce(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index b85db8ea47..e62c8185be 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1278,6 +1278,957 @@ void PairAmoebaGPU::polar_real() } } +/* ---------------------------------------------------------------------- + polar_kspace = KSpace portion of induced dipole polarization + adapted from Tinker eprecip1() routine + same as PairAmoeba, except that fphi_uind() is reimplemented here + ------------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_kspace() +{ + int i,j,k,m,n; + int nhalf1,nhalf2,nhalf3; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + int j1,j2,j3; + int ix,iy,iz; + double eterm,felec; + double r1,r2,r3; + double h1,h2,h3; + double f1,f2,f3; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double volterm,denom; + double hsq,expterm; + double term,pterm; + double 
vterm,struc2; + double tep[3]; + double fix[3],fiy[3],fiz[3]; + double cphid[4],cphip[4]; + double a[3][3]; // indices not flipped vs Fortran + + // indices into the electrostatic field array + // decremented by 1 versus Fortran + + int deriv1[10] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; + int deriv2[10] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; + int deriv3[10] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + double volbox = domain->prd[0] * domain->prd[1] * domain->prd[2]; + pterm = pow((MY_PI/aewald),2.0); + volterm = MY_PI * volbox; + + // initialize variables required for the scalar summation + + felec = electric / am_dielectric; + + // remove scalar sum virial from prior multipole FFT + // can only do this if multipoles were computed with same aeewald = apewald + // else need to re-compute it via new long-range solve + + nfft1 = p_kspace->nx; + nfft2 = p_kspace->ny; + nfft3 = p_kspace->nz; + bsorder = p_kspace->order; + + nhalf1 = (nfft1+1) / 2; + nhalf2 = (nfft2+1) / 2; + nhalf3 = (nfft3+1) / 2; + + nxlo = p_kspace->nxlo_fft; + nxhi = p_kspace->nxhi_fft; + nylo = p_kspace->nylo_fft; + nyhi = p_kspace->nyhi_fft; + nzlo = p_kspace->nzlo_fft; + nzhi = p_kspace->nzhi_fft; + + // use previous results or compute new qfac and convolution + + if (aewald == aeewald) { + vxx = -vmsave[0]; + vyy = -vmsave[1]; + vzz = -vmsave[2]; + vxy = -vmsave[3]; + vxz = -vmsave[4]; + vyz = -vmsave[5]; + + } else { + + // setup stencil size and B-spline coefficients + + moduli(); + bspline_fill(); + + // convert Cartesian multipoles to fractional coordinates + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + double ***gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + double *gridfft = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // zero virial accumulation variables + + vxx = vyy = vzz = vxy = vxz = vyz = 0.0; + + // perform convolution on K-space points I own + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? 
k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + if (hsq) expterm = exp(term) / denom; + struc2 = gridfft[n]*gridfft[n] + gridfft[n+1]*gridfft[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx -= h1*h1*vterm - eterm; + vyy -= h2*h2*vterm - eterm; + vzz -= h3*h3*vterm - eterm; + vxy -= h1*h2*vterm; + vxz -= h1*h3*vterm; + vyz -= h2*h3*vterm; + } + + expterm = qfac[m++]; + gridfft[n] *= expterm; + gridfft[n+1] *= expterm; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 3d grid in brick decomp w/ ghost values + + double ***gridpost = (double ***) p_kspace->post_convolution(); + + // get potential + + fphi_mpole(gridpost,fphi); + + for (i = 0; i < nlocal; i++) { + for (k = 0; k < 20; k++) + fphi[i][k] *= felec; + } + + // convert field from fractional to Cartesian + + fphi_to_cphi(fphi,cphi); + } + + // convert Cartesian induced dipoles to fractional coordinates + + for (i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } + + // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpre2 = (double ****) pc_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre2); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + double *gridfft = pc_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // use qfac values from above or from induce() + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpost = (double ****) pc_kspace->post_convolution(); + + // get potential + + fphi_uind(gridpost,fphid,fphip,fphidp); + + // TODO: port the remaining loops to the GPU + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fphid[i][j]; + fphip[i][j] = felec * fphip[i][j]; + } + for (j = 0; j < 20; j++) + fphidp[i][j] = felec * fphidp[i][j]; + } + + // increment the dipole polarization gradient contributions + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += (fuind[i][k]+fuinp[i][k])*fphi[i][j1]; + f2 += (fuind[i][k]+fuinp[i][k])*fphi[i][j2]; + f3 += (fuind[i][k]+fuinp[i][k])*fphi[i][j3]; + if (poltyp == MUTUAL) { + f1 += fuind[i][k]*fphip[i][j1] + fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2] + fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3] + fuinp[i][k]*fphid[i][j3]; + } + } + for (k = 0; k < 10; k++) { + f1 += fmp[i][k]*fphidp[i][deriv1[k]]; + f2 += 
fmp[i][k]*fphidp[i][deriv2[k]]; + f3 += fmp[i][k]*fphidp[i][deriv3[k]]; + } + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + } + + // set the potential to be the induced dipole average + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + fphidp[i][j] *= 0.5; + } + + fphi_to_cphi(fphidp,cphidp); + + // get the fractional to Cartesian transformation matrix + + //frac_to_cart(); + + // increment the dipole polarization virial contributions + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= cmp[i][1]*cphidp[i][1] + + 0.5*((uind[i][0]+uinp[i][0])*cphi[i][1]); + vyy -= cmp[i][2]*cphidp[i][2] + + 0.5*((uind[i][1]+uinp[i][1])*cphi[i][2]); + vzz -= cmp[i][3]*cphidp[i][3] + + 0.5*((uind[i][2]+uinp[i][2])*cphi[i][3]); + vxy -= 0.5*(cphidp[i][1]*cmp[i][2]+cphidp[i][2]*cmp[i][1]) + + 0.25*((uind[i][1]+uinp[i][1])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][2]); + vyz -= 0.5*(cphidp[i][2]*cmp[i][3]+cphidp[i][3]*cmp[i][2]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][2] + + (uind[i][1]+uinp[i][1])*cphi[i][3]); + vxz -= 0.5*(cphidp[i][1]*cmp[i][3]+cphidp[i][3]*cmp[i][1]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][3]); + + vxx -= 2.0*cmp[i][4]*cphidp[i][4] + cmp[i][7]*cphidp[i][7] + + cmp[i][8]*cphidp[i][8]; + vyy -= 2.0*cmp[i][5]*cphidp[i][5] + cmp[i][7]*cphidp[i][7] + + cmp[i][9]*cphidp[i][9]; + vzz -= 2.0*cmp[i][6]*cphidp[i][6] + cmp[i][8]*cphidp[i][8] + + cmp[i][9]*cphidp[i][9]; + vxy -= (cmp[i][4]+cmp[i][5])*cphidp[i][7] + + 0.5*(cmp[i][7]*(cphidp[i][5]+cphidp[i][4]) + + cmp[i][8]*cphidp[i][9]+cmp[i][9]*cphidp[i][8]); + vyz -= (cmp[i][5]+cmp[i][6])*cphidp[i][9] + + 0.5*(cmp[i][9]*(cphidp[i][5]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][8]+cmp[i][8]*cphidp[i][7]); + vxz -= (cmp[i][4]+cmp[i][6])*cphidp[i][8] + + 0.5*(cmp[i][8]*(cphidp[i][4]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][9]+cmp[i][9]*cphidp[i][7]); + + if (poltyp == MUTUAL) { + vxx -= 0.5 * (cphid[1]*uinp[i][0]+cphip[1]*uind[i][0]); + vyy -= 0.5 * (cphid[2]*uinp[i][1]+cphip[2]*uind[i][1]); + vzz -= 0.5 * (cphid[3]*uinp[i][2]+cphip[3]*uind[i][2]); + vxy -= 0.25 * (cphid[1]*uinp[i][1]+cphip[1]*uind[i][1] + + cphid[2]*uinp[i][0]+cphip[2]*uind[i][0]); + vyz -= 0.25 * (cphid[2]*uinp[i][2]+cphip[2]*uind[i][2] + + cphid[3]*uinp[i][1]+cphip[3]*uind[i][1]); + vxz -= 0.25 * (cphid[1]*uinp[i][2]+cphip[1]*uind[i][2] + + cphid[3]*uinp[i][0]+cphip[3]*uind[i][0]); + } + } + + + // resolve site torques then increment forces and virial + + for (i = 0; i < nlocal; i++) { + tep[0] = cmp[i][3]*cphidp[i][2] - cmp[i][2]*cphidp[i][3] + + 2.0*(cmp[i][6]-cmp[i][5])*cphidp[i][9] + cmp[i][8]*cphidp[i][7] + + cmp[i][9]*cphidp[i][5]- cmp[i][7]*cphidp[i][8] - cmp[i][9]*cphidp[i][6]; + tep[1] = cmp[i][1]*cphidp[i][3] - cmp[i][3]*cphidp[i][1] + + 2.0*(cmp[i][4]-cmp[i][6])*cphidp[i][8] + cmp[i][7]*cphidp[i][9] + + cmp[i][8]*cphidp[i][6] - cmp[i][8]*cphidp[i][4] - cmp[i][9]*cphidp[i][7]; + tep[2] = cmp[i][2]*cphidp[i][1] - cmp[i][1]*cphidp[i][2] + + 2.0*(cmp[i][5]-cmp[i][4])*cphidp[i][7] + cmp[i][7]*cphidp[i][4] + + cmp[i][9]*cphidp[i][8] - cmp[i][7]*cphidp[i][5] - cmp[i][8]*cphidp[i][9]; + + torque2force(i,tep,fix,fiy,fiz,f); + + iz = 
zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx += xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy += yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz += zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy += 0.5*(yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vyz += 0.5*(zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + vxz += 0.5*(zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + } + + // account for dipole response terms in the OPT method + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (k = 0; k < optorder; k++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fopt[i][k][j]; + fphip[i][j] = felec * foptp[i][k][j]; + } + + for (m = 0; m < optorder-k; m++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*uopt[i][m][0] + a[1][j]*uopt[i][m][1] + + a[2][j]*uopt[i][m][2]; + fuinp[i][j] = a[0][j]*uoptp[i][m][0] + a[1][j]*uoptp[i][m][1] + + a[2][j]*uoptp[i][m][2]; + } + + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + + for (j = 0; j < 3; j++) { + j1 = deriv1[j+1]; + j2 = deriv2[j+1]; + j3 = deriv3[j+1]; + f1 += fuind[i][j]*fphip[i][j1] + fuinp[i][j]*fphid[i][j1]; + f2 += fuind[i][j]*fphip[i][j2] + fuinp[i][j]*fphid[i][j2]; + f3 += fuind[i][j]*fphip[i][j3] + fuinp[i][j]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + + f[i][0] -= copm[k+m+1]*h1; + f[i][1] -= copm[k+m+1]*h2; + f[i][2] -= copm[k+m+1]*h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (j1 = 1; j1 < 4; j1++) { + cphid[j] += ftc[j][j1]*fphid[i][j1]; + cphip[j] += ftc[j][j1]*fphip[i][j1]; + } + } + + vxx -= 0.5*copm[k+m+1] * + (cphid[1]*uoptp[i][m][0] + cphip[1]*uopt[i][m][0]); + vyy -= 0.5*copm[k+m+1] * + (cphid[2]*uoptp[i][m][1]+ cphip[2]*uopt[i][m][1]); + vzz -= 0.5*copm[k+m+1] * + (cphid[3]*uoptp[i][m][2]+ cphip[3]*uopt[i][m][2]); + vxy -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][1]+ cphip[1]*uopt[i][m][1]+ + cphid[2]*uoptp[i][m][0]+ cphip[2]*uopt[i][m][0]); + vyz -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][2]+ cphip[1]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][0]+ cphip[3]*uopt[i][m][0]); + vxz -= 0.25*copm[k+m+1] * + (cphid[2]*uoptp[i][m][2]+ cphip[2]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][1]+ cphip[3]*uopt[i][m][1]); + } + } + } + } + + // account for dipole response terms in the TCG method + + /* + if (poltyp == TCG) { + + for (m = 0; m < tcgnab; m++) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*uad[i][m][0] + a[1][j]*uad[i][m][1] + + a[2][j]*uad[i][m][2]; + fuinp[i][j] = a[0][j]*ubp[i][m][0] + a[1][j]*ubp[i][m][1] + + a[2][j]*ubp[i][m][2]; + } + } + + grid_uind(fuind,fuinp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + for (k = 0; k < nfft3; k++) { + for (j = 0; j < nfft2; j++) { + for (i = 0; i < nfft1; i++) { + term = qfac[k][j][i]; + qgrid[k][j][i][0] *= term; + qgrid[k][j][i][1] *= term; + } + } + } + + efft->compute(qgrid[0][0][0],qgrid[0][0][0],-1); + fphi_uind(fphid,fphip,fphidp); + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] *= felec; + 
fphip[i][j] *= felec; + } + } + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += fuind[i][k]*fphip[i][j1]+fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2]+fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3]+fuinp[i][k]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= 0.5*(cphid[1]*ubp[i][m][0] + cphip[1]*uad[i][m][0]); + vyy -= 0.5*(cphid[2]*ubp[i][m][1] + cphip[2]*uad[i][m][1]); + vzz -= 0.5*(cphid[3]*ubp[i][m][2] + cphip[3]*uad[i][m][2]); + + vxy -= 0.25*(cphid[1]*ubp[i][m][1] + cphip[1]*uad[i][m][1] + + cphid[2]*ubp[i][m][0] + cphip[2]*uad[i][m][0]); + vyz -= 0.25*(cphid[1]*ubp[i][m][2] + cphip[1]*uad[i][m][2] + + cphid[3]*ubp[i][m][0] + cphip[3]*uad[i][m][0]); + vxz -= 0.25*(cphid[2]*ubp[i][m][2] + cphip[2]*uad[i][m][2] + + cphid[3]*ubp[i][m][1] + cphip[3]*uad[i][m][1]); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*ubd[i][m][0] + a[1][j]*ubd[i][m][1] + + a[2][j]*ubd[i][m][2]; + fuinp[i][j] = a[0][j]*uap[i][m][0] + a[1][j]*uap[i][m][1] + + a[2][j]*uap[i][m][2]; + } + } + + grid_uind(fuind,fuinp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + for (k = 0; k < nfft3; k++) { + for (j = 0; j < nfft2; j++) { + for (i = 0; i < nfft1; i++) { + term = qfac[k][j][i]; + qgrid[k][j][i][0] *= term; + qgrid[k][j][i][1] *= term; + } + } + } + + efft->compute(qgrid[0][0][0],qgrid[0][0][0],-1); + fphi_uind(fphid,fphip,fphidp); + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] *= felec; + fphip[i][j] *= felec; + } + } + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += fuind[i][k]*fphip[i][j1]+fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2]+fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3]+fuinp[i][k]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; // matvec + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= 0.5*(cphid[1]*uap[i][m][0] + cphip[1]*ubd[i][m][0]); + vyy -= 0.5*(cphid[2]*uap[i][m][1] + cphip[2]*ubd[i][m][1]); + vzz -= 0.5*(cphid[3]*uap[i][m][2] + cphip[3]*ubd[i][m][2]); + vxy -= 0.25*(cphid[1]*uap[i][m][1] + cphip[1]*ubd[i][m][1] + + cphid[2]*uap[i][m][0] + cphip[2]*ubd[i][m][0]); + vxz -= 0.25*(cphid[1]*uap[i][m][2] + cphip[1]*ubd[i][m][2] + + cphid[3]*uap[i][m][0] + cphip[3]*ubd[i][m][0]); + vyz -= 0.25*(cphid[2]*uap[i][m][2] + cphip[2]*ubd[i][m][2] + + cphid[3]*uap[i][m][1] + cphip[3]*ubd[i][m][1]); + } + } + } + */ + + // assign permanent and induced multipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; 
j < 4; j++) + cmp[i][j] += uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + double ***gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); + + // assign induced dipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] += uind[i][j-1] - uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors + + double *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + + // assign only the induced dipoles to the PME grid + // and perform the 3-D FFT forward transformation + // NOTE: why is there no inverse FFT in this section? + + if (poltyp == DIRECT || poltyp == TCG) { + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + double ***gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + double *gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + + // assign ??? 
to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = uind[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + double *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + } + + // add back missing terms for the TCG polarization method; + // first do the term for "UAD" dotted with "UBP" + + /* + if (poltyp == TCG) { + + for (m = 0; m < tcgnab; m++) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = ubp[i][m][j-1]; + } + + cmp_to_fmp(cmp,fmp); + grid_mpole(fmp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + for (k = 0; k < nfft3; k++) { + for (j = 0; j < nfft2; j++) { + for (i = 0; i < nfft1; i++) { + qgrip[k][j][i][0] = qgrid[k][j][i][0]; + qgrip[k][j][i][1] = qgrid[k][j][i][1]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = uad[i][m][j-1]; + } + + cmp_to_fmp(cmp,fmp); + grid_mpole(fmp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + // make the scalar summation over reciprocal lattice + // NOTE: this loop has to be distributed for parallel + // NOTE: why does this one include m = 0 ? + + for (m = 1; m < ntot; m++) { + k1 = m % nfft1; + k2 = (m % nff) / nfft1; + k3 = m/nff; + r1 = (k1 >= nf1) ? k1-nfft1 : k1; + r2 = (k2 >= nf2) ? k2-nfft2 : k2; + r3 = (k3 >= nf3) ? 
k3-nfft3 : k3; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[k1]*bsmod2[k2]*bsmod3[k3]; + expterm = exp(term) / denom; + struc2 = qgrid[k3][k2][k1][0]*qgrip[k3][k2][k1][0] + + qgrid[k3][k2][k1][1]*qgrip[k3][k2][k1][1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + virpolar[0] -= h1*h1*vterm - eterm; + virpolar[1] -= h2*h2*vterm - eterm; + virpolar[2] -= h3*h3*vterm - eterm; + virpolar[3] -= h1*h2*vterm; + virpolar[4] -= h1*h3*vterm; + virpolar[5] -= h2*h3*vterm; + } + } + + // now do the TCG terms with "UBD" dotted with "UAP" + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = uap[i][m][j-1]; + } + + cmp_to_fmp(cmp,fmp); + grid_mpole(fmp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + for (k = 0; k < nfft3; k++) { + for (j = 0; j < nfft2; j++) { + for (i = 0; i < nfft1; i++) { + qgrip[k][j][i][0] = qgrid[k][j][i][0]; + qgrip[k][j][i][1] = qgrid[k][j][i][1]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = ubd[i][m][j-1]; + } + + cmp_to_fmp(cmp,fmp); + grid_mpole(fmp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + // make the scalar summation over reciprocal lattice + // NOTE: this loop has to be distributed for parallel + // NOTE: why does this one include m = 0 ? + + for (m = 1; m < ntot; m++) { + k1 = m % nfft1; + k2 = (m % nff) / nfft1; + k3 = m/nff; + r1 = (k1 >= nf1) ? k1-nfft1 : k1; + r2 = (k2 >= nf2) ? k2-nfft2 : k2; + r3 = (k3 >= nf3) ? 
k3-nfft3 : k3; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[k1]*bsmod2[k2]*bsmod3[k3]; + expterm = exp(term) / denom; + struc2 = qgrid[k3][k2][k1][0]*qgrip[k3][k2][k1][0] + + qgrid[k3][k2][k1][1]*qgrip[k3][k2][k1][1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + virpolar[0] -= h1*h1*vterm - eterm; + virpolar[1] -= h2*h2*vterm - eterm; + virpolar[2] -= h3*h3*vterm - eterm; + virpolar[3] -= h1*h2*vterm; + virpolar[4] -= h1*h3*vterm; + virpolar[5] -= h2*h3*vterm; + } + } + } + } + */ + + // increment the total internal virial tensor components + + if (vflag_global) { + virpolar[0] -= vxx; + virpolar[1] -= vyy; + virpolar[2] -= vzz; + virpolar[3] -= vxy; + virpolar[4] -= vxz; + virpolar[5] -= vyz; + } +} + /* ---------------------------------------------------------------------- compute atom forces from torques ------------------------------------------------------------------------- */ diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index fe6ed3368f..77b594177b 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -43,6 +43,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); + virtual void polar_kspace(); private: int gpu_mode; From 62ecf98cda4d1bd970b7bf1b5e8f1a09c388d009 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 16 Sep 2022 14:47:16 -0500 Subject: [PATCH 118/181] Enabled fphi_uind in hippo/gpu, really need to refactor hippo and amoeba in the GPU lib to remove kernel duplicates --- lib/gpu/Nvidia.makefile | 26 +-- lib/gpu/lal_amoeba.cpp | 3 +- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_base_amoeba.cpp | 24 +-- lib/gpu/lal_base_amoeba.h | 10 +- lib/gpu/lal_hippo.cpp | 3 +- lib/gpu/lal_hippo.cu | 301 ++++++++++++++++++++++++++++++++++ lib/gpu/lal_hippo_ext.cpp | 14 ++ src/GPU/pair_amoeba_gpu.cpp | 2 +- src/GPU/pair_hippo_gpu.cpp | 311 ++++++++++++++++++++++++++++++++---- src/GPU/pair_hippo_gpu.h | 6 + 11 files changed, 626 insertions(+), 76 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index c52246b06b..5f50486e28 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -68,31 +68,7 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) # host code compilation -$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) - $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) - $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) - $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) - $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) - $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) - $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) - $(CUDR) -o $@ -c $< -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) - $(CUDR) -o $@ -c $< -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) 
+$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 48316e9b6e..02870ea861 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -64,7 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,amoeba, "k_amoeba_multipole", "k_amoeba_udirect2b", "k_amoeba_umutual2b", "k_amoeba_polar", - "k_amoeba_short_nbor", "k_amoeba_special15"); + "k_amoeba_fphi_uind", "k_amoeba_short_nbor", + "k_amoeba_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index d391279f5d..66926721cb 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1626,7 +1626,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -__kernel void k_fphi_uind(const __global numtyp4 *restrict thetai1, +__kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 3ee0517dfb..eac704fbfc 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -65,6 +65,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const char *k_name_udirect2b, const char *k_name_umutual2b, const char *k_name_polar, + const char *k_name_fphi_uind, const char *k_name_short_nbor, const char* k_name_special15) { screen=_screen; @@ -100,7 +101,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name_multipole, k_name_udirect2b, k_name_umutual2b,k_name_polar, - k_name_short_nbor, k_name_special15); + k_name_fphi_uind, k_name_short_nbor, k_name_special15); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -934,6 +935,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_short_nbor, const char* kname_special15) { if (_compiled) @@ -942,17 +944,17 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); std::string oclstring = device->compile_string()+" -DEVFLAG=1"; - pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); + pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen); - k_multipole.set_function(*pair_program,kname_multipole); - k_udirect2b.set_function(*pair_program,kname_udirect2b); - k_umutual2b.set_function(*pair_program,kname_umutual2b); - k_polar.set_function(*pair_program,kname_polar); - k_fphi_uind.set_function(*pair_program,"k_fphi_uind"); - k_short_nbor.set_function(*pair_program,kname_short_nbor); - k_special15.set_function(*pair_program,kname_special15); - pos_tex.get_texture(*pair_program,"pos_tex"); - q_tex.get_texture(*pair_program,"q_tex"); + k_multipole.set_function(*pair_program, kname_multipole); + k_udirect2b.set_function(*pair_program, kname_udirect2b); + k_umutual2b.set_function(*pair_program, kname_umutual2b); + k_polar.set_function(*pair_program, kname_polar); + 
k_fphi_uind.set_function(*pair_program, kname_fphi_uind); + k_short_nbor.set_function(*pair_program, kname_short_nbor); + k_special15.set_function(*pair_program, kname_special15); + pos_tex.get_texture(*pair_program, "pos_tex"); + q_tex.get_texture(*pair_program, "q_tex"); _compiled=true; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 802b6962b7..5aeb729993 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -62,9 +62,10 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_multipole, - const char *kname_udirect2b, const char *kname_umutual2b, - const char *kname_polar, const char *kname_short_nbor, const char* kname_special15); + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_short_nbor, + const char* kname_special15); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -309,7 +310,8 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor, const char* kname_special15); + const char *kname_fphi_uind, const char *kname_short_nbor, + const char* kname_special15); virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 79a8772c3e..9917ab91a2 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -67,7 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,hippo, "k_hippo_multipole", "k_hippo_udirect2b", "k_hippo_umutual2b", "k_hippo_polar", - "k_hippo_short_nbor", "k_hippo_special15"); + "k_hippo_fphi_uind", "k_hippo_short_nbor", + "k_hippo_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index be8d2c0701..dde8f9bfd5 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -2045,6 +2045,307 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fdip_phi1, + __global numtyp *restrict fdip_phi2, + __global numtyp *restrict fdip_sum_phi, + const int bsorder, const int inum, + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) +{ + //int tid, ii, offset, i, n_stride; + //atom_info(t_per_atom,ii,tid,offset); + + + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; + + if (iimodify, lmp->error); } @@ -198,6 +213,16 @@ void PairHippoGPU::init_style() tq_single = false; else tq_single = true; + + 
// replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + } } /* ---------------------------------------------------------------------- @@ -392,6 +417,8 @@ void PairHippoGPU::induce() int debug = 1; + first_induce_iteration = true; + // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -403,8 +430,6 @@ void PairHippoGPU::induce() // owned atoms - double **x = atom->x; - double **f = atom->f; int nlocal = atom->nlocal; // zero out the induced dipoles at each site @@ -996,37 +1021,60 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) int i,j; double term; + double time0,time1,time2; + // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; int nall = nlocal + atom->nghost; - for (i = 0; i < nall; i++) { - for (j = 0; j < 3; j++) { + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + +/* + for (int i = 0; i < nall; i++) { + for (int j = 0; j < 3; j++) { field[i][j] = 0.0; fieldp[i][j] = 0.0; } } - +*/ + // get the real space portion of the mutual field first + MPI_Barrier(world); + time0 = MPI_Wtime(); + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = MPI_Wtime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); + time2 = MPI_Wtime(); // add the self-energy portion of the mutual field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } +/* for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { field[i][j] += term*uind[i][j]; fieldp[i][j] += term*uinp[i][j]; } } - - // accumulate the field and fieldp values from real-space portion from umutual2b() on the GPU +*/ + // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) hippo_gpu_update_fieldp(&fieldp_pinned); @@ -1049,6 +1097,228 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; +} + +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual1(double **field, double **fieldp) +{ + int m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (int j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = 
a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } +/* + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } +*/ + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpre = (double ****) ic_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + double *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpost = (double ****) ic_kspace->post_convolution(); + + // get potential + double time0, time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + time1 = MPI_Wtime(); + time_fphi_uind += (time1 - time0); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + // convert the dipole fields from fractional to Cartesian + + for (int i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } +/* + for (int i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + + a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; + dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] + + a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3]; + } + } + + // increment the field at each multipole site + + for (i = 0; i < nlocal; i++) 
{ + for (j = 0; j < 3; j++) { + field[i][j] -= dipfield1[i][j]; + fieldp[i][j] -= dipfield2[i][j]; + } + } +*/ +} + +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, + thetai2, thetai3, igrid, grid, + &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out, + first_induce_iteration); + + int nlocal = atom->nlocal; + double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } } /* ---------------------------------------------------------------------- @@ -1089,29 +1359,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) double *pval = atom->dvector[index_pval]; hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, aewald, off2, &fieldp_pinned); -/* - // accumulate the field and fieldp values from the GPU lib - // field and fieldp may already have some nonzero values from kspace (umutual1) - - int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; - - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } - - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; - } -*/ } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 1ed1c3299d..742fbfb119 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -39,6 +39,8 @@ class PairHippoGPU : public PairAmoeba { virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); + virtual void umutual1(double **, double **); + virtual void fphi_uind(double ****, double **, double **, double **); virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); @@ -55,9 +57,13 @@ class PairHippoGPU : public PairAmoeba { bool gpu_dispersion_real_ready; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; + bool 
first_induce_iteration; + void udirect2b_cpu(); template From f9f777b099902e40a7880ab13f44e609fd1bb975 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 18 Sep 2022 15:09:26 -0500 Subject: [PATCH 119/181] Refactored precompute_induce to overlap data transfers with kernel launches --- lib/gpu/lal_amoeba_ext.cpp | 11 +++++++++++ lib/gpu/lal_base_amoeba.cpp | 15 +++------------ lib/gpu/lal_hippo_ext.cpp | 11 +++++++++++ src/GPU/pair_amoeba_gpu.cpp | 13 +++++++++++++ src/GPU/pair_hippo_gpu.cpp | 13 +++++++++++++ 5 files changed, 51 insertions(+), 12 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index f91b76f688..425caaabbb 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,6 +162,17 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + AMOEBAMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, double ****host_grid_brick, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index eac704fbfc..304e23274f 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -580,7 +580,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); - _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); @@ -674,7 +674,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, double ****host_grid_brick, - void** host_fdip_phi1, + void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, const int nzlo_out, const int nzhi_out, @@ -682,16 +682,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, const int nxlo_out, const int nxhi_out, bool& first_iteration) { - // allocation/resize and transfers before the first iteration - - if (first_iteration) { - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, host_igrid, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); - first_iteration = false; - } - - // TODO: find out why this host alloc helps the cgrid_brick update_device() work correcly + // TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correcly UCL_H_Vec hdummy; hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 6b189defe9..2cc17c6ced 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ 
b/lib/gpu/lal_hippo_ext.cpp @@ -193,6 +193,17 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double ** eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, double ****host_grid_brick, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 267dc666d6..5770d9542d 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,6 +88,13 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); +void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, @@ -294,6 +301,12 @@ void PairAmoebaGPU::induce() first_induce_iteration = true; + amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, + thetai2, thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 8c1b380f65..9317b11794 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -105,6 +105,13 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void hippo_gpu_update_fieldp(void **fieldp_ptr); +void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, @@ -419,6 +426,12 @@ void PairHippoGPU::induce() first_induce_iteration = true; + hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, + thetai2, thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() From caa66d904ecd6aa7fd0c0b4f04c517cb27e8b319 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 18 Sep 2022 15:54:12 -0500 Subject: [PATCH 120/181] Cleaned up GPU lib functions --- lib/gpu/lal_amoeba_ext.cpp | 16 ++++----------- lib/gpu/lal_base_amoeba.cpp | 11 ++--------- lib/gpu/lal_base_amoeba.h | 19 +++++++----------- lib/gpu/lal_hippo_ext.cpp | 15 +++----------- src/GPU/pair_amoeba_gpu.cpp | 37 
+++++++++++++---------------------- src/GPU/pair_amoeba_gpu.h | 2 -- src/GPU/pair_hippo_gpu.cpp | 39 +++++++++++++------------------------ src/GPU/pair_hippo_gpu.h | 2 -- 8 files changed, 43 insertions(+), 98 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 425caaabbb..42384cf7de 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -173,18 +173,10 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, nylo_out, nyhi_out, nxlo_out, nxhi_out); } -void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) { - AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 304e23274f..e3da81762e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -670,17 +670,10 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** host_igrid, - double ****host_grid_brick, +void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, - void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) + void **host_fdip_sum_phi) { // TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correcly UCL_H_Vec hdummy; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 5aeb729993..a88a63e870 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -151,13 +151,6 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - virtual void precompute_induce(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); - /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -180,15 +173,17 @@ class BaseAmoeba { double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void **fieldp_ptr); - virtual void compute_fphi_uind(const int inum_full, const int bsorder, + /// Allocate/resize 
per-atom arrays before induce() + virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); + const int nxlo_out, const int nxhi_out); + + virtual void compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, + void **host_fdip_sum_phi); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 2cc17c6ced..1bd6bade3a 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -204,18 +204,9 @@ void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, nylo_out, nyhi_out, nxlo_out, nxhi_out); } -void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) { - HIPPOMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); } double hippo_gpu_bytes() { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 5770d9542d..e5cdc281b9 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -95,15 +95,8 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); -void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double ****host_grid_brick, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -299,13 +292,6 @@ void PairAmoebaGPU::induce() int debug = 1; - first_induce_iteration = true; - - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -351,6 +337,15 @@ void PairAmoebaGPU::induce() } } + // allocate memory and make early host-device transfers + // must be done 
before the first ufield0c + + amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? @@ -1160,14 +1155,8 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; - amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, grid, - &fdip_phi1_pinned, &fdip_phi2_pinned, - &fdip_sum_phi_pinned, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out, - first_induce_iteration); + amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 77b594177b..420874df21 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -62,8 +62,6 @@ class PairAmoebaGPU : public PairAmoeba { bool gpu_umutual2b_ready; bool gpu_polar_real_ready; - bool first_induce_iteration; - void udirect2b_cpu(); template diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 9317b11794..1151027993 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -112,15 +112,8 @@ void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); -void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double ****host_grid_brick, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, @@ -424,14 +417,6 @@ void PairHippoGPU::induce() int debug = 1; - first_induce_iteration = true; - - hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); - // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -486,6 +471,16 @@ void PairHippoGPU::induce() udirp[i][0], udirp[i][1], udirp[i][2]); } */ + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + + hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
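For context, the ordering contract introduced above — one precompute step that allocates the device buffers and uploads the B-spline data (thetai1/2/3, igrid) before the first ufield0c(), followed by many per-iteration fphi_uind() calls that reuse those buffers — can be sketched as follows. This is only an illustration: the class, member, and function names below are hypothetical stand-ins, not the GPU-lib API from these patches.

#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the precompute/compute split used by the AMOEBA/HIPPO
// GPU path: upload per-call constants once, then reuse them every dipole iteration.
struct InduceSketch {
  std::vector<double> theta_cache;   // models thetai/igrid data kept on the device

  // one-time step per induce() call: allocate and copy spline data host -> device
  void precompute(const std::vector<double> &theta) {
    theta_cache = theta;             // models the one-time host->device transfer
  }

  // called every dipole iteration; only the grid values change between calls
  double compute_fphi(const std::vector<double> &grid) const {
    double phi = 0.0;
    for (std::size_t i = 0; i < grid.size() && i < theta_cache.size(); ++i)
      phi += theta_cache[i] * grid[i];   // placeholder for the B-spline gather
    return phi;
  }
};

int main() {
  InduceSketch solver;
  solver.precompute({0.25, 0.5, 0.25});      // once per induce(), before ufield0c()
  std::vector<double> grid = {1.0, 2.0, 3.0};
  for (int iter = 0; iter < 3; ++iter) {     // dipole iterations reuse the cached data
    std::printf("iter %d: phi = %g\n", iter, solver.compute_fphi(grid));
    grid[0] += 0.1;                          // the grid is refreshed each pass
  }
  return 0;
}

The entry points trimmed in this series (amoeba_gpu_precompute_induce, hippo_gpu_precompute_induce, and the reduced *_gpu_fphi_uind signatures) follow the same shape: the theta/igrid upload happens once per induce() call, so the repeated fphi_uind() calls inside the dipole iterations no longer resend unchanged spline data.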
@@ -1296,14 +1291,8 @@ void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; - hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, grid, - &fdip_phi1_pinned, &fdip_phi2_pinned, - &fdip_sum_phi_pinned, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out, - first_induce_iteration); + hippo_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 742fbfb119..b1b908411d 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -62,8 +62,6 @@ class PairHippoGPU : public PairAmoeba { bool gpu_umutual2b_ready; bool gpu_polar_real_ready; - bool first_induce_iteration; - void udirect2b_cpu(); template From 356c46c9139e10e7a14864efbd2f4a007b0289c1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 18 Sep 2022 16:28:30 -0500 Subject: [PATCH 121/181] Replaced mem allocation/deallocation inside moduli() with using member variables and mem resize if needed --- lib/gpu/lal_amoeba.cu | 179 +++++++++++++++++++++++++++++++++++ src/AMOEBA/amoeba_kspace.cpp | 28 +++--- src/AMOEBA/pair_amoeba.cpp | 95 ++++++++++--------- src/AMOEBA/pair_amoeba.h | 14 ++- 4 files changed, 259 insertions(+), 57 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 66926721cb..da5c6f0c3c 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1922,6 +1922,185 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, } } + +/* ---------------------------------------------------------------------- + fphi_mpole = multipole potential from grid + fphi_mpole extracts the permanent multipole potential from + the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fphi, + const int bsorder, const int inum, + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) +{ + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; + + if (ii _nfft_max) { + memory->destroy(_moduli_bsarray); + _nfft_max = maxfft; + memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray"); + } + // compute and load the moduli values double x = 0.0; - bspline(x,bsorder,array); + //bspline(x,bsorder,array); + bspline(x,bsorder,_moduli_array); - for (i = 0; i < maxfft; i++) bsarray[i] = 0.0; - for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i]; + for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0; + for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i]; - dftmod(bsmod1,bsarray,nfft1,bsorder); - dftmod(bsmod2,bsarray,nfft2,bsorder); - dftmod(bsmod3,bsarray,nfft3,bsorder); + dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder); + dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder); + dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder); // perform deallocation of local arrays - delete[] array; - delete[] bsarray; + //delete[] array; + //delete[] bsarray; } /* ---------------------------------------------------------------------- 
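// ---------------------------------------------------------------------------
// Note: the moduli() hunk above replaces the per-call new/delete of the two
// work arrays with persistent members (_moduli_array, _moduli_bsarray,
// _nfft_max) that are grown only when the FFT size increases.  A condensed
// sketch of the idiom; the free helper below is illustrative, the patch does
// this inline inside PairAmoeba::moduli().
// ---------------------------------------------------------------------------
#include "memory.h"
using namespace LAMMPS_NS;

static void grow_moduli_bsarray(Memory *memory, double *&buf,
                                int &nfft_max, int maxfft)
{
  if (maxfft > nfft_max) {
    memory->destroy(buf);              // LAMMPS Memory: safe on a null buffer
    nfft_max = maxfft;
    memory->create(buf, nfft_max, "amoeba:_moduli_bsarray");
  }
  // later calls with the same (or a smaller) maxfft reallocate nothing;
  // _moduli_array itself is sized by bsordermax and handled in grow_local()
}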
diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 9890904e42..d301a86cdb 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -68,67 +68,71 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp) // force field settings nmax = 0; - xaxis2local = yaxis2local = zaxis2local = NULL; - rpole = NULL; - tq = NULL; + xaxis2local = yaxis2local = zaxis2local = nullptr; + rpole = nullptr; + tq = nullptr; - red2local = NULL; - xred = NULL; + red2local = nullptr; + xred = nullptr; - uind = uinp = udirp = NULL; - uopt = uoptp = NULL; - fopt = foptp = NULL; - field = fieldp = NULL; - ufld = dufld = NULL; - rsd = rsdp = NULL; - zrsd = zrsdp = NULL; + uind = uinp = udirp = nullptr; + uopt = uoptp = nullptr; + fopt = foptp = nullptr; + field = fieldp = nullptr; + ufld = dufld = nullptr; + rsd = rsdp = nullptr; + zrsd = zrsdp = nullptr; - cmp = fmp = NULL; - cphi = fphi = NULL; + cmp = fmp = nullptr; + cphi = fphi = nullptr; - poli = NULL; - conj = conjp = NULL; - vec = vecp = NULL; - udir = usum = usump = NULL; + _moduli_array = nullptr; + _moduli_bsarray = nullptr; + _nfft_max = 0; - fuind = fuinp = NULL; - fdip_phi1 = fdip_phi2 = fdip_sum_phi = NULL; - dipfield1 = dipfield2 = NULL; + poli = nullptr; + conj = conjp = nullptr; + vec = vecp = nullptr; + udir = usum = usump = nullptr; - fphid = fphip = NULL; - fphidp = cphidp = NULL; + fuind = fuinp = nullptr; + fdip_phi1 = fdip_phi2 = fdip_sum_phi = nullptr; + dipfield1 = dipfield2 = nullptr; + + fphid = fphip = nullptr; + fphidp = cphidp = nullptr; bsordermax = 0; - thetai1 = thetai2 = thetai3 = NULL; - bsmod1 = bsmod2 = bsmod3 = NULL; - bsbuild = NULL; - igrid = NULL; - m_kspace = p_kspace = pc_kspace = d_kspace = NULL; - i_kspace = ic_kspace = NULL; + thetai1 = thetai2 = thetai3 = nullptr; + bsmod1 = bsmod2 = bsmod3 = nullptr; + bsbuild = nullptr; + igrid = nullptr; + m_kspace = p_kspace = pc_kspace = d_kspace = nullptr; + i_kspace = ic_kspace = nullptr; - numneigh_dipole = NULL; - firstneigh_dipole = NULL; - firstneigh_dipdip = NULL; - ipage_dipole = NULL; - dpage_dipdip = NULL; + numneigh_dipole = nullptr; + firstneigh_dipole = nullptr; + firstneigh_dipdip = nullptr; + ipage_dipole = nullptr; + dpage_dipdip = nullptr; - numneigh_precond = NULL; - firstneigh_precond = NULL; - ipage_precond = NULL; + numneigh_precond = nullptr; + firstneigh_precond = nullptr; + ipage_precond = nullptr; - firstneigh_pcpc = NULL; - dpage_pcpc = NULL; + firstneigh_pcpc = nullptr; + dpage_pcpc = nullptr; - qfac = NULL; - gridfft1 = NULL; + qfac = nullptr; + gridfft1 = nullptr; initialize_type_class(); initialize_vdwl(); initialize_smallsize(); - forcefield = NULL; + forcefield = nullptr; - id_pole = id_udalt = id_upalt = NULL; + id_pole = id_udalt = id_upalt = nullptr; nualt = 0; first_flag = 1; @@ -220,6 +224,9 @@ PairAmoeba::~PairAmoeba() memory->destroy(fphidp); memory->destroy(cphidp); + memory->destroy(_moduli_array); + memory->destroy(_moduli_bsarray); + memory->destroy(thetai1); memory->destroy(thetai2); memory->destroy(thetai3); @@ -2312,6 +2319,8 @@ void PairAmoeba::grow_local() firstneigh_pcpc = (double **) memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc"); } + + memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array"); } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 24ce6fcfbc..91ec8faf0c 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -337,7 +337,11 @@ class PairAmoeba : public 
Pair { double *gridfft1; // copy of p_kspace FFT grid double **cmp,**fmp; // Cartesian and fractional multipoles - double **cphi,**fphi; + double **cphi,**fphi; + + double *_moduli_array; // buffers for moduli + double *_moduli_bsarray; + int _nfft_max; // params for current KSpace solve and FFT being worked on @@ -347,8 +351,12 @@ class PairAmoeba : public Pair { double ctf[10][10]; // indices NOT flipped vs Fortran double ftc[10][10]; // indices NOT flipped vs Fortran - class AmoebaConvolution *m_kspace,*p_kspace,*pc_kspace,*d_kspace; - class AmoebaConvolution *i_kspace,*ic_kspace; + class AmoebaConvolution *m_kspace; // multipole KSpace + class AmoebaConvolution *p_kspace; // polar KSpace + class AmoebaConvolution *pc_kspace; + class AmoebaConvolution *d_kspace; // dispersion KSpace + class AmoebaConvolution *i_kspace; // induce KSpace + class AmoebaConvolution *ic_kspace; // FFT grid size factors From 785131932c87e0575d336f5b296cbce5731f13b6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 20 Sep 2022 13:58:17 -0500 Subject: [PATCH 122/181] Added fphi_mpole in amoeba/gpu, fixed a bug in the kernel when indexing grid --- lib/gpu/lal_amoeba.cpp | 4 +- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_amoeba_ext.cpp | 4 + lib/gpu/lal_base_amoeba.cpp | 74 ++++++++++++++- lib/gpu/lal_base_amoeba.h | 14 ++- lib/gpu/lal_hippo.cpp | 4 +- lib/gpu/lal_hippo.cu | 178 ++++++++++++++++++++++++++++++++++++ src/GPU/pair_amoeba_gpu.cpp | 37 +++++++- 8 files changed, 300 insertions(+), 17 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 02870ea861..7be4a6f59c 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -64,8 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,amoeba, "k_amoeba_multipole", "k_amoeba_udirect2b", "k_amoeba_umutual2b", "k_amoeba_polar", - "k_amoeba_fphi_uind", "k_amoeba_short_nbor", - "k_amoeba_special15"); + "k_amoeba_fphi_uind", "k_amoeba_fphi_mpole", + "k_amoeba_short_nbor", "k_amoeba_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index da5c6f0c3c..6f77fb932f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -2026,7 +2026,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, for (int ib = 0; ib < bsorder; ib++) { int i1 = istart + ib; numtyp4 tha1 = thetai1[i1]; - int gidx = 2*(k*ngridxy + j*ngridx + i); + int gidx = k*ngridxy + j*ngridx + i; numtyp tq = grid[gidx]; t0 += tq*tha1.x; t1 += tq*tha1.y; diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 42384cf7de..1f56fa86f8 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -179,6 +179,10 @@ void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); } +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi) { + AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi); +} + void amoeba_setup_fft(const int numel, const int element_type) { AMOEBAMF.setup_fft(numel, element_type); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e3da81762e..08dcd8123e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -38,6 +38,7 @@ BaseAmoebaT::~BaseAmoeba() { k_udirect2b.clear(); k_umutual2b.clear(); k_fphi_uind.clear(); + k_fphi_mpole.clear(); k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); @@ -66,6 +67,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, 
const int nall, const char *k_name_umutual2b, const char *k_name_polar, const char *k_name_fphi_uind, + const char *k_name_fphi_mpole, const char *k_name_short_nbor, const char* k_name_special15) { screen=_screen; @@ -100,8 +102,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name_multipole, - k_name_udirect2b, k_name_umutual2b,k_name_polar, - k_name_fphi_uind, k_name_short_nbor, k_name_special15); + k_name_udirect2b, k_name_umutual2b,k_name_polar, + k_name_fphi_uind, k_name_fphi_mpole, + k_name_short_nbor, k_name_special15); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -559,6 +562,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 // host_igrid is allocated with nmax by 4 // - transfer extra data from host to device +// NOTE: can be re-used for fphi_mpole() (already allocate 2x grid points) // --------------------------------------------------------------------------- template @@ -568,7 +572,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out) { - + // update bsorder with that of the kspace solver _bsorder = bsorder; // allocate or resize per-atom arrays @@ -586,7 +590,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); } else { - if (inum_full>_max_thetai_size) { + if (_thetai1.cols()<_max_thetai_size*bsorder) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder); _thetai2.resize(_max_thetai_size*bsorder); @@ -667,6 +671,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, // --------------------------------------------------------------------------- // fphi_uind = induced potential from grid // fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// NOTE: host_grid_brick is from ic_kspace post_convolution() // --------------------------------------------------------------------------- template @@ -687,7 +692,7 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; } - _cgrid_brick.update_device(false); + _cgrid_brick.update_device(_num_grid_points*2, false); const int red_blocks = fphi_uind(); @@ -727,6 +732,63 @@ int BaseAmoebaT::fphi_uind() { return GX; } +// --------------------------------------------------------------------------- +// fphi_mpole = multipole potential from grid (limited to polar_kspace for now) +// fphi_mpole extracts the permanent multipole potential from +// the particle mesh Ewald grid +// NOTE: host_grid_brick is from p_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi) +{ + // TODO: grid brick[k][j][i] is a scalar + UCL_H_Vec hdummy; + hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); + + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= 
_nxhi_out; ix++) { + _cgrid_brick[n] = host_grid_brick[iz][iy][ix]; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + const int red_blocks = fphi_mpole(); + + _fdip_sum_phi.update_host(_max_thetai_size*20); + + *host_fphi = _fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_mpole() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + int _nall=atom->nall(); + int nbor_pitch=nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_mpole.set_size(GX,BX); + k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- @@ -920,6 +982,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_umutual2b, const char *kname_polar, const char *kname_fphi_uind, + const char *kname_fphi_mpole, const char *kname_short_nbor, const char* kname_special15) { if (_compiled) @@ -935,6 +998,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_umutual2b.set_function(*pair_program, kname_umutual2b); k_polar.set_function(*pair_program, kname_polar); k_fphi_uind.set_function(*pair_program, kname_fphi_uind); + k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole); k_short_nbor.set_function(*pair_program, kname_short_nbor); k_special15.set_function(*pair_program, kname_special15); pos_tex.get_texture(*pair_program, "pos_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a88a63e870..a5ee245623 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -64,8 +64,8 @@ class BaseAmoeba { const double gpu_split, FILE *screen, const void *pair_program, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_fphi_uind, const char *kname_short_nbor, - const char* kname_special15); + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -185,6 +185,8 @@ class BaseAmoeba { void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi); + /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, @@ -279,7 +281,8 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_fphi_uind, k_fphi_mpole; UCL_Kernel k_special15, 
k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -305,13 +308,14 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_fphi_uind, const char *kname_short_nbor, - const char* kname_special15); + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int fphi_uind(); + virtual int fphi_mpole(); virtual int polar_real(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 9917ab91a2..3de6dc544c 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -67,8 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,hippo, "k_hippo_multipole", "k_hippo_udirect2b", "k_hippo_umutual2b", "k_hippo_polar", - "k_hippo_fphi_uind", "k_hippo_short_nbor", - "k_hippo_special15"); + "k_hippo_fphi_uind", "k_hippo_fphi_mpole", + "k_hippo_short_nbor", "k_hippo_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index dde8f9bfd5..91793747ef 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -2346,6 +2346,184 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, } } +/* ---------------------------------------------------------------------- + fphi_mpole = multipole potential from grid + fphi_mpole extracts the permanent multipole potential from + the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fphi, + const int bsorder, const int inum, + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) +{ + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; + + if (iinlocal, bsorder, thetai1, thetai2, thetai3, igrid, @@ -1311,6 +1314,8 @@ void PairAmoebaGPU::polar_kspace() double cphid[4],cphip[4]; double a[3][3]; // indices not flipped vs Fortran + bool gpu_fphi_mpole_ready = true; + // indices into the electrostatic field array // decremented by 1 versus Fortran @@ -1373,6 +1378,18 @@ void PairAmoebaGPU::polar_kspace() moduli(); bspline_fill(); + // allocate memory and make early host-device transfers + + // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill + if (gpu_fphi_mpole_ready) { + amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, p_kspace->nzlo_out, + p_kspace->nzhi_out, p_kspace->nylo_out, + p_kspace->nyhi_out, p_kspace->nxlo_out, + p_kspace->nxhi_out); + } + + // convert Cartesian multipoles to fractional coordinates cmp_to_fmp(cmp,fmp); @@ -1441,8 +1458,24 @@ void PairAmoebaGPU::polar_kspace() double ***gridpost = (double ***) p_kspace->post_convolution(); // get potential - - fphi_mpole(gridpost,fphi); + + if (!gpu_fphi_mpole_ready) { + fphi_mpole(gridpost,fphi); + 
//printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + } else { + void* fphi_pinned = nullptr; + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned); + + double *_fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + //printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + } for (i = 0; i < nlocal; i++) { for (k = 0; k < 20; k++) From 166701f13a585da635f96d83ad96ab14a90a024c Mon Sep 17 00:00:00 2001 From: ndtrung Date: Fri, 23 Sep 2022 11:53:09 -0500 Subject: [PATCH 123/181] Fixed missing commas in the argument list of the macros in amoeba and hippo cu files, added amoeba_convolution_gpu.cpp and .h to the source file list in GPU.cmake --- cmake/Modules/Packages/GPU.cmake | 4 +++- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_hippo.cu | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 9524324409..7bb9723485 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -3,7 +3,9 @@ set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h ${GPU_SOURCES_DIR}/fix_gpu.h ${GPU_SOURCES_DIR}/fix_gpu.cpp ${GPU_SOURCES_DIR}/fix_nh_gpu.h - ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp) + ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) target_compile_definitions(lammps PRIVATE -DLMP_GPU) set(GPU_API "opencl" CACHE STRING "API used by GPU package") diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 6f77fb932f..84a8495dfb 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -158,7 +158,7 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } -#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 91793747ef..a5fca5cc80 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -158,7 +158,7 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } -#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ From e6d2582642867d12f3906567e580f0e35feaafce Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 28 Sep 2022 15:08:18 -0500 Subject: [PATCH 124/181] Updated fphi_mpole, renamed precompute_induce to precompute_kspace --- lib/gpu/Nvidia.makefile | 29 +++++++++++- lib/gpu/lal_amoeba.cu | 91 +++++++++++++++++-------------------- lib/gpu/lal_amoeba_ext.cpp | 11 ++--- lib/gpu/lal_base_amoeba.cpp | 25 ++++++---- lib/gpu/lal_base_amoeba.h | 9 ++-- lib/gpu/lal_hippo_ext.cpp | 8 ++-- src/GPU/pair_amoeba_gpu.cpp | 43 +++++++++--------- src/GPU/pair_hippo_gpu.cpp | 14 +++--- 8 files changed, 129 insertions(+), 101 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 5f50486e28..298d404117 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -68,7 +68,34 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) +$(OBJ_DIR)/lal_answer.o: 
lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H) + $(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 84a8495dfb..ab750aaadc 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1630,7 +1630,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, - const __global numtyp *restrict grid, + const __global numtyp2 *restrict grid, __global numtyp *restrict fdip_phi1, __global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_sum_phi, @@ -1648,12 +1648,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, if (ii -void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, +void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, const int nzlo_out, const int nzhi_out, @@ -660,7 +660,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - int numel = _num_grid_points*2; + int numel = _num_grid_points; if (_cgrid_brick.cols() == 0) { _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); } else if (numel > _cgrid_brick.cols()) { @@ -688,11 +688,13 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { - _cgrid_brick[n] = host_grid_brick[iz][iy][ix][0]; - _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; - n += 2; + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix][0]; + v.y = host_grid_brick[iz][iy][ix][1]; + _cgrid_brick[n] = v; + n++; } - _cgrid_brick.update_device(_num_grid_points*2, false); + _cgrid_brick.update_device(_num_grid_points, false); const int red_blocks = fphi_uind(); @@ -740,7 +742,7 @@ int BaseAmoebaT::fphi_uind() { // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi) +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { // TODO: grid brick[k][j][i] is a scalar UCL_H_Vec hdummy; @@ -750,11 +752,15 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void 
**host_fphi for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { - _cgrid_brick[n] = host_grid_brick[iz][iy][ix]; + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix]; + v.y = (numtyp)0; + _cgrid_brick[n] = v; n++; } _cgrid_brick.update_device(_num_grid_points, false); + _felec = felec; const int red_blocks = fphi_mpole(); _fdip_sum_phi.update_host(_max_thetai_size*20); @@ -776,13 +782,14 @@ int BaseAmoebaT::fphi_mpole() { // Compute the block size and grid size to keep all cores busy const int BX=block_size(); + //printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_mpole.set_size(GX,BX); k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, - &_fdip_sum_phi, &_bsorder, &ainum, + &_fdip_sum_phi, &_bsorder, &ainum, &_felec, &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); time_pair.stop(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a5ee245623..f9a715808e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -173,8 +173,8 @@ class BaseAmoeba { double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void **fieldp_ptr); - /// Allocate/resize per-atom arrays before induce() - virtual void precompute_induce(const int inum_full, const int bsorder, + /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar + virtual void precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, const int nzlo_out, const int nzhi_out, @@ -185,7 +185,8 @@ class BaseAmoeba { void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); - virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi); + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, + const double felec); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -256,7 +257,7 @@ class BaseAmoeba { int _bsorder; UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; - UCL_Vector _cgrid_brick; + UCL_Vector _cgrid_brick; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 1bd6bade3a..a75080bfca 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -193,15 +193,15 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double ** eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out) { - HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); + HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); } 
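// ---------------------------------------------------------------------------
// Note: after this change both compute_fphi_uind() and compute_fphi_mpole()
// flatten the (iz,iy,ix) grid brick into one numtyp2 entry per grid point --
// a real/imaginary pair for the induced grid, and the real multipole value
// with a zero-padded .y for the permanent grid -- and the kernels address a
// stencil cell with the matching k*ngridxy + j*ngridx + i rule.  Sketch of
// the index arithmetic; the function and variable names are illustrative.
// ---------------------------------------------------------------------------
inline int brick_index(int ix, int iy, int iz,
                       int nxlo_out, int nylo_out, int nzlo_out,
                       int ngridx, int ngridxy)
{
  const int i = ix - nxlo_out;         // 0-based offsets into the local brick
  const int j = iy - nylo_out;
  const int k = iz - nzlo_out;
  return k*ngridxy + j*ngridx + i;     // one entry (scalar or pair) per point
}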
void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 396ff0b592..d0018bf588 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,17 +88,18 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); -void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); -void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi); +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi, + const double felec); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -343,7 +344,7 @@ void PairAmoebaGPU::induce() // must be done before the first ufield0c // NOTE: this is for ic_kspace, and thetai[1-3] - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, thetai3, igrid, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, @@ -1382,11 +1383,11 @@ void PairAmoebaGPU::polar_kspace() // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill if (gpu_fphi_mpole_ready) { - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, - thetai3, igrid, p_kspace->nzlo_out, - p_kspace->nzhi_out, p_kspace->nylo_out, - p_kspace->nyhi_out, p_kspace->nxlo_out, - p_kspace->nxhi_out); + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, + thetai1, thetai2, thetai3, igrid, + p_kspace->nzlo_out, p_kspace->nzhi_out, + p_kspace->nylo_out, p_kspace->nyhi_out, + p_kspace->nxlo_out, p_kspace->nxhi_out); } @@ -1461,10 +1462,15 @@ void PairAmoebaGPU::polar_kspace() if (!gpu_fphi_mpole_ready) { fphi_mpole(gridpost,fphi); - //printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + + for (i = 0; i < nlocal; i++) { + for (k = 0; k < 20; k++) + fphi[i][k] *= felec; + } + } else { void* fphi_pinned = nullptr; - amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned); + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); double *_fphi_ptr = (double *)fphi_pinned; for (int i = 0; i < nlocal; i++) { @@ -1474,13 +1480,8 @@ void PairAmoebaGPU::polar_kspace() idx += nlocal; } } - //printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); - } - for (i = 0; i < nlocal; i++) { - for (k = 0; k < 20; k++) - fphi[i][k] *= felec; - } + } // convert field from fractional to Cartesian diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 1151027993..4dbc998ee3 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -105,12 +105,12 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void hippo_gpu_update_fieldp(void **fieldp_ptr); -void hippo_gpu_precompute_induce(const int 
inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); @@ -475,7 +475,7 @@ void PairHippoGPU::induce() // allocate memory and make early host-device transfers // must be done before the first ufield0c - hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, thetai3, igrid, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, From 1d75ca3b209dbe8fc2bb14c38d6b12410134231e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 30 Sep 2022 16:31:13 -0500 Subject: [PATCH 125/181] Moved precompute() out of the terms in amoeba and hippo, to be involed in the first term in a time step: multipole for amoeba and repulsion for hippo --- lib/gpu/lal_amoeba.cpp | 2 +- lib/gpu/lal_amoeba_ext.cpp | 25 +++++++++++++++++-- lib/gpu/lal_base_amoeba.cpp | 21 ++++++++-------- lib/gpu/lal_base_amoeba.h | 2 +- lib/gpu/lal_hippo.cpp | 18 +++++++------- lib/gpu/lal_hippo.h | 6 ++--- lib/gpu/lal_hippo_ext.cpp | 28 ++++++++++++++++++---- src/GPU/pair_amoeba_gpu.cpp | 47 ++++++++++++++++++++++++++---------- src/GPU/pair_hippo_gpu.cpp | 48 ++++++++++++++++++++++++++----------- 9 files changed, 140 insertions(+), 57 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 7be4a6f59c..e3bb4c5ef5 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -162,7 +162,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list for the cutoff off2_mpole, - // at this point mpole is the first kernel in a time step + // at this point mpole is the first kernel in a time step for AMOEBA this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 47591e75f6..5e4d48a2da 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -117,7 +117,28 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } -int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + 
success, host_q, boxlo, prd); +} + + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -127,7 +148,7 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5496236632..16335fa17e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -226,12 +226,12 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, // --------------------------------------------------------------------------- template inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, - const int nall, double **host_x, - int *host_type, double *sublo, - double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - bool &success) { + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + bool &success) { success=true; resize_atom(inum,nall,success); resize_local(inum,host_inum,nbor->max_nbors(),success); @@ -450,7 +450,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, +void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, @@ -469,7 +469,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // NOTE: // Once all the kernels are ready, precompute() is needed only once // in the first kernel in a time step. 
- +/* int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, @@ -478,7 +478,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - +*/ // ------------------- Resize _tep array ------------------------ if (inum_full>_max_tep_size) { @@ -503,7 +503,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, _tep.update_host(_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; +// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -782,7 +782,6 @@ int BaseAmoebaT::fphi_mpole() { // Compute the block size and grid size to keep all cores busy const int BX=block_size(); - //printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); time_pair.start(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index f9a715808e..d00833cae7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -152,7 +152,7 @@ class BaseAmoeba { double *charge, double *boxlo, double *prd); /// Compute multipole real-space with device neighboring - virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 3de6dc544c..dc2b6f2c7a 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -172,7 +172,7 @@ double HippoT::host_memory_usage() const { // Reneighbor on GPU if necessary, and then compute repulsion // --------------------------------------------------------------------------- template -int** HippoT::compute_repulsion(const int ago, const int inum_full, +void HippoT::compute_repulsion(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -213,7 +213,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, // We only need to cast the necessary from host to device here // if the neighbor lists are rebuilt and other per-atom arrays // (x, type, amtype, amgroup, rpole) are ready on the device. 
- +/* int** firstneigh = nullptr; firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, @@ -222,7 +222,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - +*/ // ------------------- Resize _tep array ------------------------ if (inum_full>this->_max_tep_size) { @@ -253,7 +253,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; +// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -275,7 +275,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list for the cutoff off2_disp, - // at this point mpole is the first kernel in a time step + // at this point repuslion is the first kernel in a time step for HIPPO this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -302,7 +302,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- template -int** HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, +void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, const double aewald, const double off2_disp) { @@ -324,7 +324,7 @@ int** HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, this->hd_balancer.stop_timer(); - return nullptr; // nbor->host_jlist.begin()-host_start; + // return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -372,7 +372,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- template -int** HippoT::compute_multipole_real(const int ago, const int inum_full, +void HippoT::compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -417,7 +417,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return nullptr; // nbor->host_jlist.begin()-host_start; + //return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 492712eb85..671c9964ff 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -55,7 +55,7 @@ class Hippo : public BaseAmoeba { const double polar_dscale, const double polar_uscale); /// Compute repulsion with device neighboring - int** compute_repulsion(const int ago, const int inum_full, + virtual void compute_repulsion(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -72,12 +72,12 @@ class Hippo : public BaseAmoeba { double c3, double c4, double c5,void** tep_ptr); /// Compute dispersion real-space with device neighboring - int** compute_dispersion_real(int *host_amtype, 
int *host_amgroup, + virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, const double aewald, const double off2_disp); /// Compute multipole real-space with device neighboring - virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index a75080bfca..9644f5aca4 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -120,7 +120,27 @@ void hippo_gpu_clear() { HIPPOMF.clear(); } -int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); +} + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -132,7 +152,7 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, double *host_q, double *boxlo, double *prd, double cut2, double c0, double c1, double c2, double c3, double c4, double c5, void **tep_ptr) { - return HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, + HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, @@ -147,7 +167,7 @@ void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, aewald, off2); } -int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, +void hippo_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -157,7 +177,7 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, 
vatom, host_start, ilist, jnum, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index d0018bf588..8e021f5ce8 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -69,7 +69,19 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const double polar_dscale, const double polar_uscale, int& tq_size); void amoeba_gpu_clear(); -int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, @@ -240,6 +252,18 @@ void PairAmoebaGPU::multipole_real() } inum = atom->nlocal; + firstneigh = amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + // select the correct cutoff for the term if (use_ewald) choose(MPOLE_LONG); @@ -249,18 +273,17 @@ void PairAmoebaGPU::multipole_real() double felec = electric / am_dielectric; - firstneigh = amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + // reference to the tep array from GPU lib diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 4dbc998ee3..7658ddb011 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -70,7 +70,19 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double polar_dscale, const double polar_uscale, int& tq_size); void hippo_gpu_clear(); -int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double 
**host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -86,7 +98,7 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, const double aewald, const double off2); -int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, +void hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, @@ -258,22 +270,30 @@ void PairHippoGPU::repulsion() } inum = atom->nlocal; + firstneigh = hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + // select the correct cutoff for the term choose(REPULSE); - // set the energy unit conversion factor for multipolar real-space calculation - - firstneigh = hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd, cut2, - c0, c1, c2, c3, c4, c5, &tq_pinned); + hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From 9a1f23a0793ce30e1fa5a835b57c1724e830ef36 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 30 Sep 2022 17:32:25 -0500 Subject: [PATCH 126/181] Cosmetic changes and cleanup --- lib/gpu/lal_amoeba.cpp | 10 +++-- lib/gpu/lal_base_amoeba.cpp | 28 ++++--------- lib/gpu/lal_hippo.cpp | 83 ++++++++++++++----------------------- 3 files changed, 44 insertions(+), 77 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index e3bb4c5ef5..dfe092c52b 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -185,7 +185,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the real-space permanent field, returning field and fieldp +// Launch the real-space permanent field kernel // 
--------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { @@ -202,7 +202,9 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list if not done yet + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -225,7 +227,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the real-space induced field, returning field and fieldp +// Launch the real-space induced field kernel, returning field and fieldp // --------------------------------------------------------------------------- template int AmoebaT::umutual2b(const int eflag, const int vflag) { @@ -264,7 +266,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Launch the polar real-space kernel, returning tep // --------------------------------------------------------------------------- template int AmoebaT::polar_real(const int eflag, const int vflag) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 16335fa17e..17e05b4a16 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -447,7 +447,9 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute multipole real-space +// Compute multipole real-space part +// precompute() should be already invoked before mem (re)allocation +// this is the first part in a time step done on the GPU for AMOEBA for now // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, @@ -464,21 +466,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. 
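// Sketch (editorial, not part of this patch; argument lists elided): the
// per-timestep ordering these changes assume for the AMOEBA GPU path:
//
//   precompute(...);              // once per step: realloc per-atom arrays, cast and
//                                 // transfer host data, rebuild the device nbor list if ago==0
//   compute_multipole_real(...);  // reuses device-resident data; copies tep back
//   compute_udirect2b(...);       // permanent real-space field
//   compute_umutual2b(...);       // induced real-space field, iterated inside induce()
//   compute_polar_real(...);      // last kernel: forces/energies/virial copied back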
-/* - int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ // ------------------- Resize _tep array ------------------------ if (inum_full>_max_tep_size) { @@ -502,8 +489,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); - -// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -842,22 +827,23 @@ double BaseAmoebaT::host_memory_usage_atomic() const { } // --------------------------------------------------------------------------- -// Setup the FFT plan +// Setup the FFT plan: only placeholder for now // --------------------------------------------------------------------------- template void BaseAmoebaT::setup_fft(const int numel, const int element_type) { - + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) } // --------------------------------------------------------------------------- -// Compute FFT on the device +// Compute FFT on the device: only placeholder for now // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) #if !defined(USE_OPENCL) && !defined(USE_HIP) if (fft_plan_created == false) { int m = numel/2; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index dc2b6f2c7a..221fe16f3c 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -143,8 +143,12 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, _polar_uscale = polar_uscale; _allocated=true; - this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + coeff_amclass.row_bytes() + - + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); + this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + + coeff_amclass.row_bytes() + sp_polar.row_bytes() + + sp_nonpolar.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); return 0; } @@ -169,7 +173,7 @@ double HippoT::host_memory_usage() const { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute repulsion +// Compute the repulsion term, returning tep // --------------------------------------------------------------------------- template void HippoT::compute_repulsion(const int ago, const int inum_full, @@ -203,26 +207,6 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. 
- // We only need to cast the necessary from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. -/* - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ // ------------------- Resize _tep array ------------------------ if (inum_full>this->_max_tep_size) { @@ -252,12 +236,10 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); - -// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the repulsion term, returning tep +// Launch the repulsion kernel // --------------------------------------------------------------------------- template int HippoT::repulsion(const int eflag, const int vflag) { @@ -299,7 +281,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute dispersion real-space +// Compute dispersion real-space // --------------------------------------------------------------------------- template void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, @@ -323,12 +305,10 @@ void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); - - // return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the dispersion real-space term, returning tep +// Launch the dispersion real-space kernel // --------------------------------------------------------------------------- template int HippoT::dispersion_real(const int eflag, const int vflag) { @@ -346,7 +326,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list for the cutoff off2_disp, - // at this point mpole is the first kernel in a time step + // at this point dispersion is the first kernel in a time step this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -356,20 +336,20 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { k_dispersion.set_size(GX,BX); k_dispersion.run(&this->atom->x, &this->atom->extra, - &coeff_amtype, &coeff_amclass, &sp_nonpolar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, - &this->_off2_disp); + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); this->time_pair.stop(); return GX; } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute multipole real-space +// Compute the multipole real-space term, returning tep 
// --------------------------------------------------------------------------- template void HippoT::compute_multipole_real(const int ago, const int inum_full, @@ -416,12 +396,10 @@ void HippoT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); - - //return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the multipole real-space term, returning tep +// Launch the multipole real-space kernel // --------------------------------------------------------------------------- template int HippoT::multipole_real(const int eflag, const int vflag) { @@ -438,8 +416,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list for the cutoff off2_mpole, - // at this point mpole is the first kernel in a time step + // Build the short neighbor list for the cutoff off2_mpole this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -462,8 +439,8 @@ int HippoT::multipole_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute the direct real space part -// of the permanent field +// Compute the direct real space part of the permanent field +// returning field and fieldp // --------------------------------------------------------------------------- template void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -488,7 +465,7 @@ void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **hos } // --------------------------------------------------------------------------- -// Calculate the real-space permanent field, returning field and fieldp +// Launch the real-space permanent field kernel // --------------------------------------------------------------------------- template int HippoT::udirect2b(const int eflag, const int vflag) { @@ -505,7 +482,9 @@ int HippoT::udirect2b(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list if not done yet + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -529,8 +508,8 @@ int HippoT::udirect2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute the direct real space part -// of the induced field +// Compute the direct real space term of the induced field +// returning field and fieldp // --------------------------------------------------------------------------- template void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -554,7 +533,7 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos } // --------------------------------------------------------------------------- -// Calculate the real-space induced field, returning field and fieldp +// Launch the real-space induced field kernel // --------------------------------------------------------------------------- template int 
HippoT::umutual2b(const int eflag, const int vflag) { @@ -628,7 +607,7 @@ void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **ho } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Launch the polar real-space kernel // --------------------------------------------------------------------------- template int HippoT::polar_real(const int eflag, const int vflag) { From 009ed3630124740593603c0752b6741c92a7c8c6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 1 Oct 2022 11:16:30 -0500 Subject: [PATCH 127/181] Updated src/GPU Install.sh to include amoeba_convolution_gpu.* --- src/GPU/Install.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index d91b744c4e..48c47ae96d 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -28,6 +28,8 @@ action () { # list of files with optional dependcies +action amoeba_convolution_gpu.cpp amoeba_convolution.cpp +action amoeba_convolution_gpu.h amoeba_convolution.cpp action fix_gpu.cpp action fix_gpu.h action fix_nve_gpu.h @@ -117,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h action pair_lj_cut_gpu.cpp action pair_lj_cut_gpu.h +action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp +action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp +action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp +action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp action pair_lj_smooth_gpu.h pair_lj_smooth.cpp action pair_lj_expand_gpu.cpp @@ -159,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp action pppm_gpu.h pppm.cpp action pair_ufm_gpu.cpp pair_ufm.cpp action pair_ufm_gpu.h pair_ufm.h -action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp -action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp -action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp -action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp # edit 2 Makefile.package files to include/exclude package info From 6b9e83fe2093fafc2167fde727c77d6b4ed2e735 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 6 Oct 2022 15:03:58 -0500 Subject: [PATCH 128/181] Added timing for the induced dipole spreading part, computed the block size to ensure all the CUs are occupied by the fphi_uind and fphi_mpole kernels --- lib/gpu/lal_amoeba.cpp | 11 ++++-- lib/gpu/lal_base_amoeba.cpp | 64 +++++++++++++++++++++++++--------- lib/gpu/lal_base_amoeba.h | 4 +++ lib/gpu/lal_device.cpp | 1 + lib/gpu/lal_device.h | 4 ++- lib/gpu/lal_hippo.cpp | 11 ++++-- src/AMOEBA/amoeba_induce.cpp | 9 ++++- src/AMOEBA/pair_amoeba.cpp | 13 +++++-- src/AMOEBA/pair_amoeba.h | 10 +++--- src/GPU/pair_amoeba_gpu.cpp | 66 ++++++------------------------------ 10 files changed, 106 insertions(+), 87 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index dfe092c52b..b61d7595af 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -278,9 +278,14 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); + const int max_cus = this->device->max_cus(); + int BX=this->block_size(); + int 
GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + while (GX < max_cus) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c79804dd95..3b2381f211 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -155,7 +155,14 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) fft_plan_created = false; + #endif + + #ifdef ASYNC_DEVICE_COPY + _end_command_queue=ucl_device->num_queues(); + ucl_device->push_command_queue(); + #endif return success; } @@ -507,6 +514,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double *fieldp_ptr=_fieldp.host.begin(); + // specify the correct cutoff and alpha values _off2_polar = off2_polar; _aewald = aewald; const int red_blocks=udirect2b(_eflag,_vflag); @@ -525,18 +533,20 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void** fieldp_ptr) { - // all the necessary data arrays are already copied from host to device - - //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + // only copy the necessary data arrays that are updated over the iterations + // use nullptr for the other arrays that are already copied from host to device cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); atom->add_extra_data(); + // set the correct cutoff and alpha _off2_polar = off2_polar; _aewald = aewald; + // launch the kernel const int red_blocks=umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer + // after umutual1 and self are done on the GPU // *fieldp_ptr=_fieldp.host.begin(); // _fieldp.update_host(_max_fieldp_size*8,false); } @@ -547,7 +557,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 // host_igrid is allocated with nmax by 4 // - transfer extra data from host to device -// NOTE: can be re-used for fphi_mpole() (already allocate 2x grid points) +// NOTE: can be re-used for fphi_mpole() but with a different bsorder value // --------------------------------------------------------------------------- template @@ -588,6 +598,12 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, } } + #ifdef ASYNC_DEVICE_COPY + _thetai1.cq(ucl_device->cq(_end_command_queue)); + _thetai2.cq(ucl_device->cq(_end_command_queue)); + _thetai3.cq(ucl_device->cq(_end_command_queue)); + #endif + // pack host data to device for (int i = 0; i < inum_full; i++) @@ -634,6 +650,8 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, } _igrid.update_device(true); + // _cgrid_brick holds the grid-based potential + _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; _nylo_out = nylo_out; @@ -679,14 +697,21 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, _cgrid_brick[n] = v; n++; } - 
_cgrid_brick.update_device(_num_grid_points, false); + _cgrid_brick.update_device(_num_grid_points, true); + #ifdef ASYNC_DEVICE_COPY + ucl_device->sync(); + #endif + + // launch the kernel with its execution configuration (see below) const int red_blocks = fphi_uind(); - _fdip_phi1.update_host(_max_thetai_size*10); - _fdip_phi2.update_host(_max_thetai_size*10); - _fdip_sum_phi.update_host(_max_thetai_size*20); + // copy data from device to host asynchronously + _fdip_phi1.update_host(_max_thetai_size*10, true); + _fdip_phi2.update_host(_max_thetai_size*10, true); + _fdip_sum_phi.update_host(_max_thetai_size*20, true); + // return the pointers to the host-side arrays *host_fdip_phi1 = _fdip_phi1.host.begin(); *host_fdip_phi2 = _fdip_phi2.host.begin(); *host_fdip_sum_phi = _fdip_sum_phi.host.begin(); @@ -701,13 +726,15 @@ int BaseAmoebaT::fphi_uind() { if (ainum == 0) return 0; - int _nall=atom->nall(); - int nbor_pitch=nbor->nbor_pitch(); - // Compute the block size and grid size to keep all cores busy - const int BX=block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - + const int max_cus = device->max_cus(); + int BX=block_size(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + while (GX < max_cus) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/BX)); + } + time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); @@ -766,8 +793,13 @@ int BaseAmoebaT::fphi_mpole() { int nbor_pitch=nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int BX=block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + const int max_cus = device->max_cus(); + int BX=block_size(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + while (GX < max_cus) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/BX)); + } time_pair.start(); int ngridxy = _ngridx * _ngridy; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index d00833cae7..2e992a33d9 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -31,6 +31,8 @@ #include "geryon/nvd_texture.h" #endif +//#define ASYNC_DEVICE_COPY + #if !defined(USE_OPENCL) && !defined(USE_HIP) // temporary workaround for int2 also defined in cufft #ifdef int2 @@ -263,6 +265,8 @@ class BaseAmoeba { int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; + int _end_command_queue; + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 039970a0d3..89ae503a97 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -214,6 +214,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, } } _first_device = _last_device = best_device; + _max_cus = best_cus; type = gpu->device_type(_first_device); if (ndevices > 0) { diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 74f802a096..7def4b7f82 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -241,6 +241,8 @@ class Device { inline int shuffle_avail() const { return _shuffle_avail; } /// For OpenCL, 0 if fast-math options disabled, 1 enabled inline int fast_math() const { return _fast_math; } + /// return the max number of CUs among the devices + inline int max_cus() const { return _max_cus; } /// Return the number of threads per atom for pair styles inline int threads_per_atom() const { return _threads_per_atom; } @@ -324,7 +326,7 @@ class Device { private: std::queue *> 
ans_queue; - int _init_count; + int _init_count, _max_cus; bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 221fe16f3c..d8ef3e9a44 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -619,9 +619,14 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); + const int max_cus = this->device->max_cus(); + int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + while (GX < max_cus) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index f10535a36a..69125854f9 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -901,14 +901,22 @@ void PairAmoeba::umutual1(double **field, double **fieldp) } } + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid + MPI_Barrier(world); + time0 = MPI_Wtime(); + grid_uind(fuind,fuinp,gridpre); + time1 = MPI_Wtime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition @@ -945,7 +953,6 @@ void PairAmoeba::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential - double time0, time1; MPI_Barrier(world); time0 = MPI_Wtime(); diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index a164fc4d9c..75c749e61f 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -367,7 +367,7 @@ void PairAmoeba::compute(int eflag, int vflag) time_mutual_rspace = time_mutual_kspace = 0.0; time_polar_rspace = time_polar_kspace = 0.0; - time_fphi_uind = 0.0; + time_grid_uind = time_fphi_uind = 0.0; if (ic_kspace) { ic_kspace->time_fft = 0.0; } @@ -566,6 +566,9 @@ void PairAmoeba::finish() MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_polar_kspace = ave/comm->nprocs; + MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_grid_uind = ave/comm->nprocs; + MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_fphi_uind = ave/comm->nprocs; @@ -592,15 +595,19 @@ void PairAmoeba::finish() utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); - utils::logmesg(lmp," Real-space timing breakdown:\n"); + double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace; + double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace; + + utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total); utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} 
{:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); - utils::logmesg(lmp," K-space timing breakdown:\n"); + utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total); utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total); utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index af40f4a6ad..781d8a1e2f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -80,11 +80,11 @@ class PairAmoeba : public Pair { double time_init, time_hal, time_repulse, time_disp; double time_mpole, time_induce, time_polar, time_qxfer; - double time_mpole_rspace,time_mpole_kspace; - double time_direct_rspace,time_direct_kspace; - double time_mutual_rspace,time_mutual_kspace; - double time_polar_rspace,time_polar_kspace; - double time_fphi_uind; + double time_mpole_rspace, time_mpole_kspace; + double time_direct_rspace, time_direct_kspace; + double time_mutual_rspace, time_mutual_kspace; + double time_polar_rspace, time_polar_kspace; + double time_grid_uind, time_fphi_uind; // energy/virial components diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 49698f1825..6b977cb638 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -930,15 +930,6 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) memset(&field[0][0], 0, 3*nall *sizeof(double)); memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); -/* - for (int i = 0; i < nall; i++) { - for (int j = 0; j < 3; j++) { - field[i][j] = 0.0; - fieldp[i][j] = 0.0; - } - } -*/ - // get the real space portion of the mutual field first MPI_Barrier(world); @@ -960,19 +951,13 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) field[i][1] += term*uind[i][1]; field[i][2] += term*uind[i][2]; } + for (int i = 0; i < nlocal; i++) { fieldp[i][0] += term*uinp[i][0]; fieldp[i][1] += term*uinp[i][1]; fieldp[i][2] += term*uinp[i][2]; } -/* - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] += term*uind[i][j]; - fieldp[i][j] += term*uinp[i][j]; - } - } -*/ + // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) @@ -1029,7 +1014,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) } int nlocal = atom->nlocal; - for (int i = 0; i < nlocal; i++) { fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; @@ -1041,22 +1025,23 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } -/* - for (i = 0; i < 
nlocal; i++) { - for (j = 0; j < 3; j++) { - fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; - fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; - } - } -*/ + + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid + MPI_Barrier(world); + time0 = MPI_Wtime(); + grid_uind(fuind,fuinp,gridpre); + time1 = MPI_Wtime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition @@ -1093,9 +1078,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential - double time0, time1; - - MPI_Barrier(world); time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); @@ -1114,14 +1096,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) } } - // convert the dipole fields from fractional to Cartesian - - for (int i = 0; i < 3; i++) { - a[0][i] = nfft1 * recip[0][i]; - a[1][i] = nfft2 * recip[1][i]; - a[2][i] = nfft3 * recip[2][i]; - } - for (int i = 0; i < nlocal; i++) { double dfx = a[0][0]*fdip_phi1[i][1] + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; @@ -1145,25 +1119,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fieldp[i][1] -= dfy; fieldp[i][2] -= dfz; } -/* - for (int i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + - a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; - dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] + - a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3]; - } - } - // increment the field at each multipole site - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] -= dipfield1[i][j]; - fieldp[i][j] -= dipfield2[i][j]; - } - } -*/ } /* ---------------------------------------------------------------------- From 00f46120c79f841dcecf78d75e7498bf7a3fc708 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 7 Oct 2022 15:50:30 -0500 Subject: [PATCH 129/181] Removed max_cus() from Device, used device->gpu->cus() instead --- lib/gpu/lal_amoeba.cpp | 4 ++-- lib/gpu/lal_base_amoeba.cpp | 8 ++++---- lib/gpu/lal_device.cpp | 1 - lib/gpu/lal_device.h | 4 +--- lib/gpu/lal_hippo.cpp | 4 ++-- 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index b61d7595af..1c0aa77706 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -278,10 +278,10 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int max_cus = this->device->max_cus(); + const int cus = this->device->gpu->cus(); int BX=this->block_size(); int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < max_cus) { + while (GX < cus) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 3b2381f211..8e4e8faf83 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -727,10 +727,10 @@ int BaseAmoebaT::fphi_uind() { return 0; // Compute the block size and grid size to keep all cores busy - const int max_cus = device->max_cus(); + const int cus = device->gpu->cus(); int BX=block_size(); int 
GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < max_cus) { + while (GX < cus) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } @@ -793,10 +793,10 @@ int BaseAmoebaT::fphi_mpole() { int nbor_pitch=nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int max_cus = device->max_cus(); + const int cus = device->gpu->cus(); int BX=block_size(); int GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < max_cus) { + while (GX < cus) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 89ae503a97..039970a0d3 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -214,7 +214,6 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, } } _first_device = _last_device = best_device; - _max_cus = best_cus; type = gpu->device_type(_first_device); if (ndevices > 0) { diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 7def4b7f82..74f802a096 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -241,8 +241,6 @@ class Device { inline int shuffle_avail() const { return _shuffle_avail; } /// For OpenCL, 0 if fast-math options disabled, 1 enabled inline int fast_math() const { return _fast_math; } - /// return the max number of CUs among the devices - inline int max_cus() const { return _max_cus; } /// Return the number of threads per atom for pair styles inline int threads_per_atom() const { return _threads_per_atom; } @@ -326,7 +324,7 @@ class Device { private: std::queue *> ans_queue; - int _init_count, _max_cus; + int _init_count; bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d8ef3e9a44..f20a0cfd62 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -619,10 +619,10 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int max_cus = this->device->max_cus(); + const int cus = this->device->gpu->cus(); int BX=this->block_size(); int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < max_cus) { + while (GX < cus) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } From 2f1f7ee0fa49d79a970adad810ec290d509933f4 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 3 Nov 2022 23:45:40 -0500 Subject: [PATCH 130/181] Cleaned up code --- lib/gpu/lal_amoeba.cu | 24 +++++++----------------- src/GPU/pair_amoeba_gpu.cpp | 1 + src/GPU/pair_hippo_gpu.cpp | 20 ++++++++++---------- 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index ab750aaadc..cc593e4263 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1639,10 +1639,6 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const int nxlo_out, const int ngridxy, const int ngridx) { - //int tid, ii, offset, i, n_stride; - //atom_info(t_per_atom,ii,tid,offset); - - int tid=THREAD_ID_X; int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; @@ -1763,23 +1759,17 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, */ const int i1 = istart + ib; const numtyp4 tha1 = thetai1[i1]; - /* - const numtyp w0 = tha1.x; - const numtyp w1 = tha1.y; - const numtyp w2 = tha1.z; - const numtyp w3 = tha1.w; - */ 
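// Note (editorial): tha1.x, tha1.y, tha1.z, tha1.w hold the four per-point theta
// weights (formerly aliased as w0..w3 in the block removed above).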
const int gidx = my + i; // k*ngridxy + j*ngridx + i; const numtyp2 tq = grid[gidx]; const numtyp tq_1 = tq.x; //grid[gidx]; const numtyp tq_2 = tq.y; //grid[gidx+1]; - t0_1 += tq_1*tha1.x; // w0 - t1_1 += tq_1*tha1.y; // w1 - t2_1 += tq_1*tha1.z; // w2 - t0_2 += tq_2*tha1.x; // w0 - t1_2 += tq_2*tha1.y; // w1 - t2_2 += tq_2*tha1.z; // w2 - t3 += (tq_1+tq_2)*tha1.w; // w3 + t0_1 += tq_1*tha1.x; + t1_1 += tq_1*tha1.y; + t2_1 += tq_1*tha1.z; + t0_2 += tq_2*tha1.x; + t1_2 += tq_2*tha1.y; + t2_2 += tq_2*tha1.z; + t3 += (tq_1+tq_2)*tha1.w; i++; } diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 6b977cb638..fa0670a757 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1078,6 +1078,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential + time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index b874c656c3..49a83e75be 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -1170,22 +1170,24 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } -/* - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; - fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; - } - } -*/ + + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid + + MPI_Barrier(world); + time0 = MPI_Wtime(); + grid_uind(fuind,fuinp,gridpre); + time1 = MPI_Wtime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition @@ -1222,9 +1224,7 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential - double time0, time1; - MPI_Barrier(world); time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); From a3cc0e8432495d70cb1bb4ea8dc8a51c43841f20 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 4 Nov 2022 13:45:59 -0500 Subject: [PATCH 131/181] Reverted the block size tuning, which caused bugs for low atom counts (will revisit later) --- lib/gpu/lal_amoeba.cpp | 10 ++++++---- lib/gpu/lal_base_amoeba.cpp | 22 +++++++++++++--------- lib/gpu/lal_hippo.cpp | 12 +++++++----- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 1c0aa77706..38aa2bde27 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -278,14 +278,16 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* const int cus = this->device->gpu->cus(); - int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < cus) { + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } - + */ this->time_pair.start(); // Build the 
short neighbor list if not done yet diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 8e4e8faf83..e6ffcd764a 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -727,14 +727,16 @@ int BaseAmoebaT::fphi_uind() { return 0; // Compute the block size and grid size to keep all cores busy - const int cus = device->gpu->cus(); - int BX=block_size(); - int GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < cus) { + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } - + */ time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); @@ -793,14 +795,16 @@ int BaseAmoebaT::fphi_mpole() { int nbor_pitch=nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + /* const int cus = device->gpu->cus(); - int BX=block_size(); - int GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < cus) { + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } - + */ time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_mpole.set_size(GX,BX); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index f20a0cfd62..d4366cac85 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -619,14 +619,16 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int cus = this->device->gpu->cus(); - int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < cus) { + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } - + */ this->time_pair.start(); // Build the short neighbor list if not done yet From 959b9c220fabc63f8e87ce45aacb6acb0a14ca7b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 7 Nov 2022 15:49:37 -0600 Subject: [PATCH 132/181] Cleaned up unused member functions and hd_balancer calls --- lib/gpu/lal_base_amoeba.cpp | 102 ++---------------------------------- lib/gpu/lal_base_amoeba.h | 14 +---- lib/gpu/lal_hippo.cpp | 52 ++++++------------ lib/gpu/lal_hippo_ext.cpp | 78 +++++++++++++-------------- 4 files changed, 59 insertions(+), 187 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e6ffcd764a..a9c76d578e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -270,99 +270,6 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, return mn; } -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials -// for the polar real-space term -// --------------------------------------------------------------------------- -template -void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - int *ilist, int *numj, int **firstneigh, - const bool eflag_in, 
const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, const double aewald, const double felec, - const double off2_polar, double *host_q, const int nlocal, - double *boxlo, double *prd, void **tep_ptr) { - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); - - // ------------------- Resize _tep array ------------------------ - - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); - - dev_nspecial15.clear(); - dev_special15.clear(); - dev_special15_t.clear(); - dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); - } - - *tep_ptr=_tep.host.begin(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - resize_atom(0,nall,success); - zero_timers(); - return; - } - - int ago=hd_balancer.ago_first(f_ago); - int inum=hd_balancer.balance(ago,inum_full,cpu_time); - ans->inum(inum); - host_start=inum; - - if (ago==0) { - reset_nbors(nall, inum, ilist, numj, firstneigh, success); - if (!success) - return; - } - - // packing host arrays into host_extra - - atom->cast_x_data(host_x,host_type); - atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); - hd_balancer.start_timer(); - atom->add_x_data(host_x,host_type); - atom->add_q_data(); - atom->add_extra_data(); - - device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, - boxlo, prd); - - _off2_polar = off2_polar; - _felec = felec; - const int red_blocks=polar_real(eflag,vflag); - - ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); - device->add_ans_object(ans); - hd_balancer.stop_timer(); - - // copy tep from device to host - - _tep.update_host(_max_tep_size*4,false); -} - // --------------------------------------------------------------------------- // Prepare for multiple kernel calls in a time step: // - reallocate per-atom arrays, if needed @@ -450,6 +357,8 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall dev_short_nbor.resize((2+_max_nbors)*_nmax); } + hd_balancer.stop_timer(); + return nbor->host_jlist.begin()-host_start; } @@ -491,8 +400,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); //device->add_ans_object(ans); - hd_balancer.stop_timer(); - // copy tep from device to host _tep.update_host(_max_tep_size*4,false); @@ -828,7 +735,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, const double aewald, const double felec, const double off2_polar, void **tep_ptr) { - int** firstneigh = nullptr; + // cast necessary data arrays from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -845,10 +752,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); - hd_balancer.stop_timer(); - // copy tep from device to host - 
_tep.update_host(_max_tep_size*4,false); } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 2e992a33d9..0fb2469d23 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -182,11 +182,12 @@ class BaseAmoeba { const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); - + /// Interpolate the induced potential from the grid virtual void compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); + /// Interpolate the multipolar potential from the grid virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec); @@ -198,17 +199,6 @@ class BaseAmoeba { const double aewald, const double felec, const double off2_polar, void **tep_ptr); - /// Compute polar real-space with host neighboring (not active for now) - void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, const double aewald, const double felec, - const double off2_polar, double *charge, const int nlocal, double *boxlo, - double *prd, void **tep_ptr); - // copy field and fieldp from device to host after umutual2b virtual void update_fieldp(void **fieldp_ptr) { *fieldp_ptr=_fieldp.host.begin(); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d4366cac85..334d75ac26 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -177,20 +177,20 @@ double HippoT::host_memory_usage() const { // --------------------------------------------------------------------------- template void HippoT::compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, - double cut2, double c0, double c1, double c2, - double c3, double c4, double c5, void **tep_ptr) { + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -225,16 +225,7 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, _c5 = c5; const int red_blocks=repulsion(this->_eflag,this->_vflag); - // only copy them back if this is the last kernel - // otherwise, commenting out these two lines to leave the answers - // (forces, energies and virial) on the device 
until the last kernel - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } @@ -303,8 +294,6 @@ void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, // (forces, energies and virial) on the device until the last kernel //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); } // --------------------------------------------------------------------------- @@ -386,15 +375,7 @@ void HippoT::compute_multipole_real(const int ago, const int inum_full, this->_aewald = aewald; const int red_blocks=multipole_real(this->_eflag,this->_vflag); - // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (this one, or polar_real once done) - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } @@ -595,14 +576,11 @@ void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **ho const int red_blocks=polar_real(this->_eflag,this->_vflag); // only copy answers (forces, energies and virial) back from the device - // in the last kernel (which is polar_real here) + // in the last kernel in a timestep (which is polar_real here) this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); this->device->add_ans_object(this->ans); - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 9644f5aca4..77450bf7b1 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -69,15 +69,15 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass int init_ok=0; if (world_me==0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, - host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_repel, host_special_disp, - host_special_mpole, host_special_polar_wscale, - host_special_polar_piscale, host_special_polar_pscale, - host_sizpr, host_dmppr, host_elepr, - host_csix, host_adisp, host_pcore, host_palpha, - nlocal, nall, max_nbors, - maxspecial, maxspecial15, cell_size, gpu_split, - screen, polar_dscale, polar_uscale); + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); HIPPOMF.device->world_barrier(); if (message) @@ -94,15 +94,15 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass } if (gpu_rank==i && world_me!=0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, - host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_repel, host_special_disp, - host_special_mpole, host_special_polar_wscale, - host_special_polar_piscale, host_special_polar_pscale, - host_sizpr, host_dmppr, host_elepr, - host_csix, host_adisp, host_pcore, host_palpha, - nlocal, nall, max_nbors, - 
maxspecial, maxspecial15, cell_size, gpu_split, - screen, polar_dscale, polar_uscale); + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); HIPPOMF.device->gpu_barrier(); if (message) @@ -121,16 +121,16 @@ void hippo_gpu_clear() { } int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd) { + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, nullptr, nullptr, sublo, subhi, tag, @@ -141,17 +141,17 @@ int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, } void hippo_gpu_compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd, - double cut2, double c0, double c1, double c2, - double c3, double c4, double c5, void **tep_ptr) { + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, From 03e48f26589aebb11752d969bcf25ec750543efc Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 14 Jan 2023 19:51:42 -0600 Subject: [PATCH 133/181] Fixed memory leak in hippo/gpu --- src/GPU/pair_hippo_gpu.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 
deletions(-) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 49a83e75be..915c67e512 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -440,11 +440,8 @@ void PairHippoGPU::induce() // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - if (use_ewald) { - choose(POLAR_LONG); - int nmine = p_kspace->nfft_owned; - memory->create(qfac,nmine,"ameoba/induce:qfac"); - } else choose(POLAR); + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); // owned atoms From c21f2faa1f7e4dfa767ecd336a1d3bc3fcb593f2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 14 Jan 2023 20:02:36 -0600 Subject: [PATCH 134/181] Cleaned up debug statements and unused sections in the amoeba and hippo gpu styles --- src/GPU/pair_amoeba_gpu.cpp | 302 +----------------------------------- src/GPU/pair_hippo_gpu.cpp | 86 +++------- 2 files changed, 22 insertions(+), 366 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index fa0670a757..534ab24085 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -13,7 +13,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Trung Nguyen (Northwestern) + Contributing author: Trung Nguyen (Northwestern/UChicago) ------------------------------------------------------------------------- */ #include "pair_amoeba_gpu.h" @@ -486,8 +486,6 @@ void PairAmoebaGPU::induce() comm->reverse_comm(this); } - //error->all(FLERR,"STOP GPU"); - // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { @@ -547,8 +545,6 @@ void PairAmoebaGPU::induce() comm->reverse_comm(this); } - //error->all(FLERR,"STOP"); - for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = vec[i][j]; @@ -1751,166 +1747,6 @@ void PairAmoebaGPU::polar_kspace() } } - // account for dipole response terms in the TCG method - - /* - if (poltyp == TCG) { - - for (m = 0; m < tcgnab; m++) { - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - fuind[i][j] = a[0][j]*uad[i][m][0] + a[1][j]*uad[i][m][1] + - a[2][j]*uad[i][m][2]; - fuinp[i][j] = a[0][j]*ubp[i][m][0] + a[1][j]*ubp[i][m][1] + - a[2][j]*ubp[i][m][2]; - } - } - - grid_uind(fuind,fuinp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - for (k = 0; k < nfft3; k++) { - for (j = 0; j < nfft2; j++) { - for (i = 0; i < nfft1; i++) { - term = qfac[k][j][i]; - qgrid[k][j][i][0] *= term; - qgrid[k][j][i][1] *= term; - } - } - } - - efft->compute(qgrid[0][0][0],qgrid[0][0][0],-1); - fphi_uind(fphid,fphip,fphidp); - - for (i = 0; i < nlocal; i++) { - for (j = 1; j < 10; j++) { - fphid[i][j] *= felec; - fphip[i][j] *= felec; - } - } - - for (i = 0; i < nlocal; i++) { - f1 = 0.0; - f2 = 0.0; - f3 = 0.0; - for (k = 0; k < 3; k++) { - j1 = deriv1[k+1]; - j2 = deriv2[k+1]; - j3 = deriv3[k+1]; - f1 += fuind[i][k]*fphip[i][j1]+fuinp[i][k]*fphid[i][j1]; - f2 += fuind[i][k]*fphip[i][j2]+fuinp[i][k]*fphid[i][j2]; - f3 += fuind[i][k]*fphip[i][j3]+fuinp[i][k]*fphid[i][j3]; - } - - f1 *= 0.5 * nfft1; - f2 *= 0.5 * nfft2; - f3 *= 0.5 * nfft3; - h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; - h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; - h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; - f[i][0] -= h1; - f[i][1] -= h2; - f[i][2] -= h3; - - for (j = 1; j < 4; j++) { - cphid[j] = 0.0; - cphip[j] = 0.0; - for (k = 1; k < 4; k++) { - cphid[j] += 
ftc[j][k]*fphid[i][k]; - cphip[j] += ftc[j][k]*fphip[i][k]; - } - } - - vxx -= 0.5*(cphid[1]*ubp[i][m][0] + cphip[1]*uad[i][m][0]); - vyy -= 0.5*(cphid[2]*ubp[i][m][1] + cphip[2]*uad[i][m][1]); - vzz -= 0.5*(cphid[3]*ubp[i][m][2] + cphip[3]*uad[i][m][2]); - - vxy -= 0.25*(cphid[1]*ubp[i][m][1] + cphip[1]*uad[i][m][1] + - cphid[2]*ubp[i][m][0] + cphip[2]*uad[i][m][0]); - vyz -= 0.25*(cphid[1]*ubp[i][m][2] + cphip[1]*uad[i][m][2] + - cphid[3]*ubp[i][m][0] + cphip[3]*uad[i][m][0]); - vxz -= 0.25*(cphid[2]*ubp[i][m][2] + cphip[2]*uad[i][m][2] + - cphid[3]*ubp[i][m][1] + cphip[3]*uad[i][m][1]); - } - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - fuind[i][j] = a[0][j]*ubd[i][m][0] + a[1][j]*ubd[i][m][1] + - a[2][j]*ubd[i][m][2]; - fuinp[i][j] = a[0][j]*uap[i][m][0] + a[1][j]*uap[i][m][1] + - a[2][j]*uap[i][m][2]; - } - } - - grid_uind(fuind,fuinp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - for (k = 0; k < nfft3; k++) { - for (j = 0; j < nfft2; j++) { - for (i = 0; i < nfft1; i++) { - term = qfac[k][j][i]; - qgrid[k][j][i][0] *= term; - qgrid[k][j][i][1] *= term; - } - } - } - - efft->compute(qgrid[0][0][0],qgrid[0][0][0],-1); - fphi_uind(fphid,fphip,fphidp); - - for (i = 0; i < nlocal; i++) { - for (j = 1; j < 10; j++) { - fphid[i][j] *= felec; - fphip[i][j] *= felec; - } - } - - for (i = 0; i < nlocal; i++) { - f1 = 0.0; - f2 = 0.0; - f3 = 0.0; - for (k = 0; k < 3; k++) { - j1 = deriv1[k+1]; - j2 = deriv2[k+1]; - j3 = deriv3[k+1]; - f1 += fuind[i][k]*fphip[i][j1]+fuinp[i][k]*fphid[i][j1]; - f2 += fuind[i][k]*fphip[i][j2]+fuinp[i][k]*fphid[i][j2]; - f3 += fuind[i][k]*fphip[i][j3]+fuinp[i][k]*fphid[i][j3]; - } - - f1 *= 0.5 * nfft1; - f2 *= 0.5 * nfft2; - f3 *= 0.5 * nfft3; - h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; // matvec - h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; - h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; - f[i][0] -= h1; - f[i][1] -= h2; - f[i][2] -= h3; - - for (j = 1; j < 4; j++) { - cphid[j] = 0.0; - cphip[j] = 0.0; - for (k = 1; k < 4; k++) { - cphid[j] += ftc[j][k]*fphid[i][k]; - cphip[j] += ftc[j][k]*fphip[i][k]; - } - } - - vxx -= 0.5*(cphid[1]*uap[i][m][0] + cphip[1]*ubd[i][m][0]); - vyy -= 0.5*(cphid[2]*uap[i][m][1] + cphip[2]*ubd[i][m][1]); - vzz -= 0.5*(cphid[3]*uap[i][m][2] + cphip[3]*ubd[i][m][2]); - vxy -= 0.25*(cphid[1]*uap[i][m][1] + cphip[1]*ubd[i][m][1] + - cphid[2]*uap[i][m][0] + cphip[2]*ubd[i][m][0]); - vxz -= 0.25*(cphid[1]*uap[i][m][2] + cphip[1]*ubd[i][m][2] + - cphid[3]*uap[i][m][0] + cphip[3]*ubd[i][m][0]); - vyz -= 0.25*(cphid[2]*uap[i][m][2] + cphip[2]*ubd[i][m][2] + - cphid[3]*uap[i][m][1] + cphip[3]*ubd[i][m][1]); - } - } - } - */ - // assign permanent and induced multipoles to the PME grid for (i = 0; i < nlocal; i++) { @@ -2097,142 +1933,6 @@ void PairAmoebaGPU::polar_kspace() } } - // add back missing terms for the TCG polarization method; - // first do the term for "UAD" dotted with "UBP" - - /* - if (poltyp == TCG) { - - for (m = 0; m < tcgnab; m++) { - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 10; j++) - cmp[i][j] = 0.0; - for (j = 1; j < 4; j++) - cmp[i][j] = ubp[i][m][j-1]; - } - - cmp_to_fmp(cmp,fmp); - grid_mpole(fmp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - for (k = 0; k < nfft3; k++) { - for (j = 0; j < nfft2; j++) { - for (i = 0; i < nfft1; i++) { - qgrip[k][j][i][0] = qgrid[k][j][i][0]; - qgrip[k][j][i][1] = qgrid[k][j][i][1]; - } - } - } - - for (i = 0; i < nlocal; i++) { - for (j = 1; j < 4; j++) - cmp[i][j] = uad[i][m][j-1]; - } - - cmp_to_fmp(cmp,fmp); 
- grid_mpole(fmp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - // make the scalar summation over reciprocal lattice - // NOTE: this loop has to be distributed for parallel - // NOTE: why does this one include m = 0 ? - - for (m = 1; m < ntot; m++) { - k1 = m % nfft1; - k2 = (m % nff) / nfft1; - k3 = m/nff; - r1 = (k1 >= nf1) ? k1-nfft1 : k1; - r2 = (k2 >= nf2) ? k2-nfft2 : k2; - r3 = (k3 >= nf3) ? k3-nfft3 : k3; - h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; - h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; - h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; - hsq = h1*h1 + h2*h2 + h3*h3; - term = -pterm * hsq; - expterm = 0.0; - if (term > -50.0 && hsq != 0.0) { - denom = volterm*hsq*bsmod1[k1]*bsmod2[k2]*bsmod3[k3]; - expterm = exp(term) / denom; - struc2 = qgrid[k3][k2][k1][0]*qgrip[k3][k2][k1][0] + - qgrid[k3][k2][k1][1]*qgrip[k3][k2][k1][1]; - eterm = 0.5 * felec * expterm * struc2; - vterm = (2.0/hsq) * (1.0-term) * eterm; - virpolar[0] -= h1*h1*vterm - eterm; - virpolar[1] -= h2*h2*vterm - eterm; - virpolar[2] -= h3*h3*vterm - eterm; - virpolar[3] -= h1*h2*vterm; - virpolar[4] -= h1*h3*vterm; - virpolar[5] -= h2*h3*vterm; - } - } - - // now do the TCG terms with "UBD" dotted with "UAP" - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 10; j++) - cmp[i][j] = 0.0; - for (j = 1; j < 4; j++) - cmp[i][j] = uap[i][m][j-1]; - } - - cmp_to_fmp(cmp,fmp); - grid_mpole(fmp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - for (k = 0; k < nfft3; k++) { - for (j = 0; j < nfft2; j++) { - for (i = 0; i < nfft1; i++) { - qgrip[k][j][i][0] = qgrid[k][j][i][0]; - qgrip[k][j][i][1] = qgrid[k][j][i][1]; - } - } - } - - for (i = 0; i < nlocal; i++) { - for (j = 1; j < 4; j++) - cmp[i][j] = ubd[i][m][j-1]; - } - - cmp_to_fmp(cmp,fmp); - grid_mpole(fmp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - // make the scalar summation over reciprocal lattice - // NOTE: this loop has to be distributed for parallel - // NOTE: why does this one include m = 0 ? - - for (m = 1; m < ntot; m++) { - k1 = m % nfft1; - k2 = (m % nff) / nfft1; - k3 = m/nff; - r1 = (k1 >= nf1) ? k1-nfft1 : k1; - r2 = (k2 >= nf2) ? k2-nfft2 : k2; - r3 = (k3 >= nf3) ? 
k3-nfft3 : k3; - h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; - h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; - h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; - hsq = h1*h1 + h2*h2 + h3*h3; - term = -pterm * hsq; - expterm = 0.0; - if (term > -50.0 && hsq != 0.0) { - denom = volterm*hsq*bsmod1[k1]*bsmod2[k2]*bsmod3[k3]; - expterm = exp(term) / denom; - struc2 = qgrid[k3][k2][k1][0]*qgrip[k3][k2][k1][0] + - qgrid[k3][k2][k1][1]*qgrip[k3][k2][k1][1]; - eterm = 0.5 * felec * expterm * struc2; - vterm = (2.0/hsq) * (1.0-term) * eterm; - virpolar[0] -= h1*h1*vterm - eterm; - virpolar[1] -= h2*h2*vterm - eterm; - virpolar[2] -= h3*h3*vterm - eterm; - virpolar[3] -= h1*h2*vterm; - virpolar[4] -= h1*h3*vterm; - virpolar[5] -= h2*h3*vterm; - } - } - } - } - */ - // increment the total internal virial tensor components if (vflag_global) { diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 915c67e512..61c30c0ad1 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -13,7 +13,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Trung Nguyen (Northwestern) + Contributing author: Trung Nguyen (Northwestern/UChicago) ------------------------------------------------------------------------- */ #include "pair_hippo_gpu.h" @@ -208,14 +208,14 @@ void PairHippoGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, - pdamp, thole, dirdamp, amtype2class, - special_repel, special_disp, special_mpole, - special_polar_wscale, special_polar_piscale, - special_polar_pscale, sizpr, dmppr, elepr, - csix, adisp, pcore, palpha, - atom->nlocal, atom->nlocal+atom->nghost, mnf, - maxspecial, maxspecial15, cell_size, gpu_mode, - screen, polar_dscale, polar_uscale, tq_size); + pdamp, thole, dirdamp, amtype2class, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, sizpr, dmppr, elepr, + csix, adisp, pcore, palpha, + atom->nlocal, atom->nlocal+atom->nghost, mnf, + maxspecial, maxspecial15, cell_size, gpu_mode, + screen, polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) @@ -271,14 +271,14 @@ void PairHippoGPU::repulsion() inum = atom->nlocal; firstneigh = hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - nullptr, nullptr, nullptr, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, domain->prd); + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); // select the correct cutoff for the term @@ -480,14 +480,6 @@ void PairHippoGPU::induce() } } } -/* - printf("GPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < 10; i++) { - printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", - i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); - } -*/ // allocate memory and make early host-device transfers // must be done before the first ufield0c @@ -611,8 +603,6 
@@ void PairHippoGPU::induce() comm->reverse_comm(this); } - //error->all(FLERR,"STOP GPU"); - // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { @@ -1022,7 +1012,7 @@ void PairHippoGPU::udirect2b_cpu() tdipdip[ndip++] = bcn[1]*yr*zr; tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; } else { - if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + } } // jj @@ -1055,16 +1045,7 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) memset(&field[0][0], 0, 3*nall *sizeof(double)); memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); - -/* - for (int i = 0; i < nall; i++) { - for (int j = 0; j < 3; j++) { - field[i][j] = 0.0; - fieldp[i][j] = 0.0; - } - } -*/ - + // get the real space portion of the mutual field first MPI_Barrier(world); @@ -1086,19 +1067,13 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) field[i][1] += term*uind[i][1]; field[i][2] += term*uind[i][2]; } + for (int i = 0; i < nlocal; i++) { fieldp[i][0] += term*uinp[i][0]; fieldp[i][1] += term*uinp[i][1]; fieldp[i][2] += term*uinp[i][2]; } -/* - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] += term*uind[i][j]; - fieldp[i][j] += term*uinp[i][j]; - } - } -*/ + // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) @@ -1271,25 +1246,6 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fieldp[i][1] -= dfy; fieldp[i][2] -= dfz; } -/* - for (int i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + - a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; - dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] + - a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3]; - } - } - - // increment the field at each multipole site - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] -= dipfield1[i][j]; - fieldp[i][j] -= dipfield2[i][j]; - } - } -*/ } /* ---------------------------------------------------------------------- From 67574601ed8bfadb5e4a4139ae52b89399e080b7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 15 Jan 2023 15:41:54 -0600 Subject: [PATCH 135/181] Cleaned up commented-out and debugging stuffs, removed irrelevant changes to lj/cut/dipole/cut, reverted unwanted changes in the PPPMGPU destructor, fixed unresolved conflicts in tinker.py, updated the userbinsize==0 case in atom.cpp and using Force::pair_match() as suggested. Internal timing stuffs need work. 
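
A note on the Force::pair_match() change referenced above: the hunks for fix_amoeba_bitorsion.cpp and improper_amoeba.cpp below replace four exact-name lookups with two anchored prefix matches, so the /gpu variants are found by the same call. A minimal sketch of the resulting lookup, assuming pair_match(pattern,exact,nsub) treats a leading ^ as a regex-style anchor when exact=0 (the error text shown is the one used by the bitorsion fix):

    // locate either the plain or the GPU-accelerated pair style with one pattern each
    Pair *pair = force->pair_match("^amoeba", 0, 0);      // matches "amoeba" and "amoeba/gpu"
    if (!pair) pair = force->pair_match("^hippo", 0, 0);  // matches "hippo" and "hippo/gpu"
    if (!pair)
      error->all(FLERR, "Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo");

This way, future accelerated suffix variants of the two pair styles would be picked up without further edits to the fix and improper classes.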
--- cmake/CMakeLists.txt | 1 - examples/amoeba/in.ubiquitin | 2 -- src/AMOEBA/amoeba_induce.cpp | 40 ++---------------------- src/AMOEBA/amoeba_kspace.cpp | 8 ----- src/AMOEBA/fix_amoeba_bitorsion.cpp | 6 ++-- src/AMOEBA/improper_amoeba.cpp | 7 ++--- src/AMOEBA/pair_amoeba.cpp | 7 ----- src/DIPOLE/pair_lj_cut_dipole_cut.cpp | 19 ++---------- src/Depend.sh | 4 +++ src/GPU/pppm_gpu.cpp | 2 ++ src/atom.cpp | 11 +++---- tools/tinker/tinker2lmp.py | 44 --------------------------- 12 files changed, 19 insertions(+), 132 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d7137c3672..0223750ace 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -395,7 +395,6 @@ endif() pkg_depends(ML-IAP ML-SNAP) pkg_depends(MPIIO MPI) pkg_depends(ATC MANYBODY) -pkg_depends(AMOEBA KSPACE) pkg_depends(LATBOLTZ MPI) pkg_depends(SCAFACOS MPI) pkg_depends(AMOEBA KSPACE) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index cacb7d3571..cb789a19f8 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -34,8 +34,6 @@ pair_coeff * * amoeba_ubiquitin.prm amoeba_ubiquitin.key special_bonds lj/coul 0.5 0.5 0.5 one/five yes -# setup force components this way so can dump them (AMOEBA or HIPPO also needs them for now) - fix fhal all store/state 0 fx fy fz fix frepulse all store/state 0 fx fy fz fix fdisp all store/state 0 fx fy fz diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 17c4df326d..031173060c 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -86,17 +86,6 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm(this); - // DEBUG statements - - /* - for (i = 0; i < nlocal; i++) - if (atom->tag[i] == 1) - printf("AAA FIELD atom %d: field %g %g %g: fieldp %g %g %g\n", - atom->tag[i], - field[i][0],field[i][1],field[i][2], - fieldp[i][0],fieldp[i][1],fieldp[i][2]); - */ - // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { @@ -213,16 +202,7 @@ void PairAmoeba::induce() cfstyle = INDUCE; comm->forward_comm(this); -/* - if (comm->me == 0) { - printf("CPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; udirp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } -*/ + ufield0c(field,fieldp); crstyle = FIELD; @@ -284,18 +264,6 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm(this); - - //error->all(FLERR,"STOP"); -/* - if (comm->me == 0) { - printf("CPU: iter = %d\n", iter); - for (i = 0; i < 10; i++) { - printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", - i, field[i][0], field[i][1], field[i][2], - fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -413,8 +381,6 @@ void PairAmoeba::induce() } } - // if (comm->me == 0) printf("CG iteration count = %d\n",iter); - // terminate the calculation if dipoles failed to converge // NOTE: could make this an error @@ -1033,9 +999,7 @@ void PairAmoeba::umutual2b(double **field, double **fieldp) j = jlist[jj]; uindj = uind[j]; uinpj = uinp[j]; - //if (i==0 && j == 10) - // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", - // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); + fid[0] = tdipdip[0]*uindj[0] + tdipdip[1]*uindj[1] + tdipdip[2]*uindj[2]; fid[1] = tdipdip[1]*uindj[0] + tdipdip[3]*uindj[1] + tdipdip[4]*uindj[2]; fid[2] = tdipdip[2]*uindj[0] + tdipdip[4]*uindj[1] + 
tdipdip[5]*uindj[2]; diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index 76d13da780..9213b96042 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -68,8 +68,6 @@ void PairAmoeba::moduli() int maxfft = MAX(nfft1,nfft2); maxfft = MAX(maxfft,nfft3); - //double *array = new double[bsorder]; - //double *bsarray = new double[maxfft]; if (maxfft > _nfft_max) { memory->destroy(_moduli_bsarray); _nfft_max = maxfft; @@ -79,7 +77,6 @@ void PairAmoeba::moduli() // compute and load the moduli values double x = 0.0; - //bspline(x,bsorder,array); bspline(x,bsorder,_moduli_array); for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0; @@ -88,11 +85,6 @@ void PairAmoeba::moduli() dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder); dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder); dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder); - - // perform deallocation of local arrays - - //delete[] array; - //delete[] bsarray; } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/fix_amoeba_bitorsion.cpp b/src/AMOEBA/fix_amoeba_bitorsion.cpp index 6c3c31eec8..cb8c62819d 100644 --- a/src/AMOEBA/fix_amoeba_bitorsion.cpp +++ b/src/AMOEBA/fix_amoeba_bitorsion.cpp @@ -194,10 +194,8 @@ void FixAmoebaBiTorsion::init() // error check that PairAmoeba or PairHiippo exist pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("amoeba/gpu",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); - if (!pair) pair = force->pair_match("hippo/gpu",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); if (!pair) error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo"); diff --git a/src/AMOEBA/improper_amoeba.cpp b/src/AMOEBA/improper_amoeba.cpp index 136857e74b..cb9db01b59 100644 --- a/src/AMOEBA/improper_amoeba.cpp +++ b/src/AMOEBA/improper_amoeba.cpp @@ -285,10 +285,9 @@ void ImproperAmoeba::init_style() // check if PairAmoeba disabled improper terms Pair *pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("amoeba/gpu",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); - if (!pair) pair = force->pair_match("hippo/gpu",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); + if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo"); int tmp; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index bb06ecb4a4..2a1a10075c 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -1055,13 +1055,6 @@ void PairAmoeba::init_style() // request standard neighbor list - -// int irequest = neighbor->request(this,instance_me); - - // for DEBUGGING with GPU - //neighbor->requests[irequest]->half = 0; - //neighbor->requests[irequest]->full = 1; - neighbor->add_request(this); } diff --git a/src/DIPOLE/pair_lj_cut_dipole_cut.cpp b/src/DIPOLE/pair_lj_cut_dipole_cut.cpp index 2047eb8b9c..a7e5674a88 100644 --- a/src/DIPOLE/pair_lj_cut_dipole_cut.cpp +++ b/src/DIPOLE/pair_lj_cut_dipole_cut.cpp @@ -90,8 +90,6 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) numneigh = list->numneigh; firstneigh = list->firstneigh; - int maxsize = 10; - // loop over neighbors of my atoms for (ii = 0; ii < inum; ii++) { @@ -104,13 +102,6 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) jlist = firstneigh[i]; jnum = numneigh[i]; - double scale_dipole = 1.0; - if (jnum > maxsize) { - scale_dipole = maxsize; 
//1.0/(double)maxsize; - } else { - scale_dipole = jnum; //1.0/(double)jnum; - } - for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; @@ -216,7 +207,7 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) // total force - fq = scale_dipole*factor_coul*qqrd2e; + fq = factor_coul*qqrd2e; fx = fq*forcecoulx + delx*forcelj; fy = fq*forcecouly + dely*forcelj; fz = fq*forcecoulz + delz*forcelj; @@ -230,7 +221,7 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) torque[i][1] += fq*tiycoul; torque[i][2] += fq*tizcoul; - if (newton_pair) { + if (newton_pair || j < nlocal) { f[j][0] -= fx; f[j][1] -= fy; f[j][2] -= fz; @@ -371,13 +362,7 @@ void PairLJCutDipoleCut::init_style() if (!atom->q_flag || !atom->mu_flag || !atom->torque_flag) error->all(FLERR,"Pair dipole/cut requires atom attributes q, mu, torque"); -<<<<<<< HEAD - int irequest = neighbor->request(this,instance_me); - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; -======= neighbor->add_request(this); ->>>>>>> amoeba } /* ---------------------------------------------------------------------- diff --git a/src/Depend.sh b/src/Depend.sh index 6cf613cde7..28ac78d9af 100755 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -45,6 +45,10 @@ depend () { # add one if statement per parent package # add one depend() call per child package that depends on that parent +if (test $1 = "AMOEBA") then + depend GPU +fi + if (test $1 = "ASPHERE") then depend GPU depend OPENMP diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 4019eb467d..a2a2b0eed8 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -102,6 +102,8 @@ PPPMGPU::PPPMGPU(LAMMPS *lmp) : PPPM(lmp) PPPMGPU::~PPPMGPU() { PPPM_GPU_API(clear)(poisson_time); + destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out); + destroy_3d_offset(vd_brick,nzlo_out,nylo_out); } /* ---------------------------------------------------------------------- diff --git a/src/atom.cpp b/src/atom.cpp index 8b78b4f8f7..0de44e50ca 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2358,16 +2358,13 @@ void Atom::setup_sort_bins() } #ifdef LMP_GPU - if (userbinsize == 0.0) { - int ifix = modify->find_fix("package_gpu"); - if (ifix >= 0) { + if (userbinsize == 0.0) { + auto ifix = dynamic_cast(modify->get_fix_by_id("package_gpu")); + if (ifix) { const double subx = domain->subhi[0] - domain->sublo[0]; const double suby = domain->subhi[1] - domain->sublo[1]; const double subz = domain->subhi[2] - domain->sublo[2]; - - FixGPU *fix = static_cast(modify->fix[ifix]); - binsize = fix->binsize(subx, suby, subz, atom->nlocal, - 0.5 * neighbor->cutneighmax); + binsize = ifix->binsize(subx, suby, subz, atom->nlocal, 0.5 * neighbor->cutneighmax); } } #endif diff --git a/tools/tinker/tinker2lmp.py b/tools/tinker/tinker2lmp.py index d376593ea3..e3ae59748c 100644 --- a/tools/tinker/tinker2lmp.py +++ b/tools/tinker/tinker2lmp.py @@ -227,11 +227,7 @@ class XYZfile(object): print(i+1,label[i],x[i],y[i],z[i],type[i], end=' ', file=fp) for j in bonds[i]: print(j, end=' ', file=fp) print(file=fp) -<<<<<<< HEAD - -======= ->>>>>>> develop fp.close() # triplet of atoms in an angle = atom 1,2,3 @@ -1098,16 +1094,6 @@ for i,one in enumerate(alist): elif len(params[3]) == 2: nbonds,hcount = xyz.angle_hbond_count(atom1,atom2,atom3,lmptype,lmpmass) -<<<<<<< HEAD - - if nbonds != 3: - print("Center angle atom has wrong bond count") - print(" angle atom IDs:",atom1,atom2,atom3) - print(" angle atom classes:",c1,c2,c3) - print(" Tinker FF file param 
options:",len(params[3])) - print(" Nbonds and hydrogen count:",nbonds,hcount) - #sys.exit() NOTE: allow this for now -======= #if nbonds != 3: #print("Center angle atom has wrong bond count") @@ -1117,33 +1103,12 @@ for i,one in enumerate(alist): #print(" Nbonds and hydrogen count:",nbonds,hcount) # NOTE: allow this for now #sys.exit() ->>>>>>> develop if hcount == 0: which = 1 elif hcount == 1: which = 2 m += 1 -<<<<<<< HEAD - print("3-bond angle") - print(" angle atom IDs:",atom1,atom2,atom3) - print(" angle atom classes:",c1,c2,c3) - print(" Tinker FF file param options:",len(params[3])) - print(" Nbonds and hydrogen count:",nbonds,hcount) - print(" which:",which,m) - - elif len(params[3]) == 3: - nbonds,hcount = xyz.angle_hbond_count(atom1,atom2,atom3,lmptype,lmpmass) - - if nbonds != 4: - print("Center angle atom has wrong bond count") - print(" angle atom IDs:",atom1,atom2,atom3) - print(" angle atom classes:",c1,c2,c3) - print(" Tinker FF file param options:",len(params[3])) - print(" Nbonds and hydrogen count:",nbonds,hcount) - #sys.exit() NOTE: allow this for now - -======= #print("3-bond angle") #print(" angle atom IDs:",atom1,atom2,atom3) #print(" angle atom classes:",c1,c2,c3) @@ -1163,7 +1128,6 @@ for i,one in enumerate(alist): # NOTE: allow this for now #sys.exit() ->>>>>>> develop if hcount == 0: which = 1 elif hcount == 1: which = 2 @@ -1207,12 +1171,8 @@ for itype in range(len(aparams)): elif (c3,c2,c1) in badict: n1,n2,r1,r2 = badict[(c3,c2,c1)] else: -<<<<<<< HEAD - print("Bond-stretch angle triplet not found: %d %d %d" % (c1,c2,c3)) -======= # NOTE: just for debugging #print("Bond-stretch angle triplet not found: %d %d %d" % (c1,c2,c3)) ->>>>>>> develop n1,n2,r1,r2 = 4*[0.0] baparams.append((n1,n2,r1,r2)) @@ -1670,11 +1630,7 @@ print("Natoms =",natoms) print("Ntypes =",ntypes) print("Tinker XYZ types =",len(tink2lmp)) print("Tinker PRM types =",prm.ntypes) -<<<<<<< HEAD -#print "Tinker groups =",ngroups -======= #print("Tinker groups =",ngroups) ->>>>>>> develop print("Nmol =",nmol) print("Nbonds =",nbonds) print("Nangles =",nangles) From d5b878d04726164381178fe6d9e2ebdee10c8d07 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 15 Jan 2023 15:56:40 -0600 Subject: [PATCH 136/181] Updated the doc page of amoeba/hippo styles to indicate that their gpu versions are supported --- doc/src/pair_amoeba.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index f5c0ea14df..94c956a585 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -1,11 +1,18 @@ .. index:: pair_style amoeba +.. index:: pair_style amoeba/gpu .. index:: pair_style hippo +.. index:: pair_style hippo/gpu pair_style amoeba command ========================= +Accelerator Variants: *amoeba/gpu* + pair_style hippo command ======================== + +Accelerator Variants: *hippo/gpu* + Syntax """""" @@ -187,6 +194,10 @@ These pair styles can only be used via the *pair* keyword of the ---------- +.. 
include:: accel_styles.rst + +---------- + Restrictions """""""""""" From c9ae41246d45cc29ba3e68f715809cc6b3028617 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 15 Jan 2023 16:05:36 -0600 Subject: [PATCH 137/181] Ran the four make commands in the src folder: make fix-whitespace; make fix-homepage; make fix-errordocs; make fix-permissions --- lib/gpu/lal_atom.h | 2 +- lib/gpu/lal_base_amoeba.cpp | 18 +++++++++--------- lib/gpu/lal_base_amoeba.h | 4 ++-- lib/gpu/lal_hippo.cpp | 4 ++-- lib/gpu/lal_hippo.h | 2 +- lib/gpu/lal_hippo_extra.h | 2 +- lib/gpu/lal_neighbor.cpp | 4 ++-- lib/gpu/lal_neighbor.h | 2 +- src/AMOEBA/amoeba_kspace.cpp | 2 +- src/AMOEBA/amoeba_multipole.cpp | 4 ++-- src/AMOEBA/pair_amoeba.cpp | 6 +++--- src/AMOEBA/pair_amoeba.h | 4 ++-- src/GPU/amoeba_convolution_gpu.cpp | 2 +- src/GPU/amoeba_convolution_gpu.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 28 ++++++++++++++-------------- src/GPU/pair_amoeba_gpu.h | 15 +-------------- src/GPU/pair_hippo_gpu.cpp | 16 ++++++++-------- src/GPU/pair_hippo_gpu.h | 15 +-------------- 18 files changed, 53 insertions(+), 79 deletions(-) diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index bec1ad38cc..142d64ef1d 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -285,7 +285,7 @@ class Atom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; } - + /// Signal that we need to transfer atom extra data for next kernel call inline void extra_data_unavail() { _extra_avail=false; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a9c76d578e..a1d4a00c2c 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -42,7 +42,7 @@ BaseAmoebaT::~BaseAmoeba() { k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); - + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) if (fft_plan_created) cufftDestroy(plan); #endif @@ -365,7 +365,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // --------------------------------------------------------------------------- // Compute multipole real-space part // precompute() should be already invoked before mem (re)allocation -// this is the first part in a time step done on the GPU for AMOEBA for now +// this is the first part in a time step done on the GPU for AMOEBA for now // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, @@ -418,7 +418,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - + *fieldp_ptr=_fieldp.host.begin(); // specify the correct cutoff and alpha values @@ -443,7 +443,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // only copy the necessary data arrays that are updated over the iterations // use nullptr for the other arrays that are already copied from host to device cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); - atom->add_extra_data(); + atom->add_extra_data(); // set the correct cutoff and alpha _off2_polar = off2_polar; @@ -648,7 +648,7 @@ int BaseAmoebaT::fphi_uind() { int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, - &_fdip_phi1, 
&_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, + &_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); time_pair.stop(); @@ -738,7 +738,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, // cast necessary data arrays from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - atom->add_extra_data(); + atom->add_extra_data(); *tep_ptr=_tep.host.begin(); @@ -784,7 +784,7 @@ template void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) - #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) if (fft_plan_created == false) { int m = numel/2; cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); @@ -793,7 +793,7 @@ void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int // n = number of double complex int n = numel/2; - + // copy the host array to the device (data) UCL_Vector data; data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE); @@ -807,7 +807,7 @@ void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int data.update_device(false); // perform the in-place forward FFT - + cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device, (cufftDoubleComplex*)&data.device, CUFFT_FORWARD); if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 0fb2469d23..a20c3886d5 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -256,7 +256,7 @@ class BaseAmoeba { int _ngridx, _ngridy, _ngridz, _num_grid_points; int _end_command_queue; - + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; @@ -312,7 +312,7 @@ class BaseAmoeba { virtual int fphi_uind(); virtual int fphi_mpole(); virtual int polar_real(const int eflag, const int vflag) = 0; - + #if !defined(USE_OPENCL) && !defined(USE_HIP) cufftHandle plan; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 334d75ac26..f8ab436ad0 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -597,11 +597,11 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - + const int BX=this->block_size(); const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); /* - const int cus = this->device->gpu->cus(); + const int cus = this->device->gpu->cus(); while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 671c9964ff..4780ab8ea9 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -124,7 +124,7 @@ class Hippo : public BaseAmoeba { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; + /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; UCL_D_Vec coeff_rep; /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h index ac02e2e9e8..7ff62aa9a4 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -173,7 +173,7 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, 
dmpik[4] = pre * (s*d2s + ds*ds); dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); - + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); } diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 983cea307a..10816e2fa6 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -684,7 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (_cutoff < _cell_size) vadjust*=1.46; mn=std::max(mn,static_cast(ceil(_max_neighbor_factor*vadjust*mn))); if (mn<33) mn+=3; - + resize_max_neighbors(mn,success); set_nbor_block_size(mn/2); if (!success) @@ -837,7 +837,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); } -void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, +void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, const int columns_in, const int rows_in) { const int b2x=_block_cell_2d; diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 9ea02b0b40..9061ce5150 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -260,7 +260,7 @@ class Neighbor { } /// Helper function - void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, const int columns_in, const int rows_in); private: diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index 9213b96042..c47e734c5e 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -73,7 +73,7 @@ void PairAmoeba::moduli() _nfft_max = maxfft; memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray"); } - + // compute and load the moduli values double x = 0.0; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 3b5dbbed51..7269128080 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -419,7 +419,7 @@ void PairAmoeba::multipole_real() term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; - + // find damped multipole intermediates for force and torque @@ -465,7 +465,7 @@ void PairAmoeba::multipole_real() term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9); term6 = 4.0 * rr7; - + } empole += e; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 2a1a10075c..df9472e188 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -570,7 +570,7 @@ void PairAmoeba::finish() double time_mutual_fft = ic_kspace->time_fft; MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mutual_fft = ave/comm->nprocs; - + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -597,7 +597,7 @@ void PairAmoeba::finish() utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); - utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total); utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", 
time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); @@ -606,7 +606,7 @@ void PairAmoeba::finish() utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); - + } } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 1bb3212df8..f14be4bd11 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -347,8 +347,8 @@ class PairAmoeba : public Pair { class AmoebaConvolution *m_kspace; // multipole KSpace class AmoebaConvolution *p_kspace; // polar KSpace - class AmoebaConvolution *pc_kspace; - class AmoebaConvolution *d_kspace; // dispersion KSpace + class AmoebaConvolution *pc_kspace; + class AmoebaConvolution *d_kspace; // dispersion KSpace class AmoebaConvolution *i_kspace; // induce KSpace class AmoebaConvolution *ic_kspace; diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index 0284791d38..fd4aece6c8 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -1,7 +1,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/ Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/src/GPU/amoeba_convolution_gpu.h b/src/GPU/amoeba_convolution_gpu.h index c446995b4a..4286f2155f 100644 --- a/src/GPU/amoeba_convolution_gpu.h +++ b/src/GPU/amoeba_convolution_gpu.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/ Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 534ab24085..713015b5c5 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -2,7 +2,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -283,7 +283,7 @@ void PairAmoebaGPU::multipole_real() success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + // reference to the tep array from GPU lib @@ -400,7 +400,7 @@ void PairAmoebaGPU::induce() } for (i = 0; i < nlocal; i++) { - itype = amtype[i]; + itype = amtype[i]; for (j = 0; j < 3; j++) { uopt[i][m][j] = polarity[itype] * field[i][j]; uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; @@ -666,7 +666,7 @@ void PairAmoebaGPU::induce() if (iter >= maxiter || eps > epsold) if (comm->me == 0) - error->warning(FLERR,"AMOEBA induced dipoles did not converge"); + error->warning(FLERR,"AMOEBA induced dipoles did not converge"); } // update the lists of previous induced dipole values @@ -958,7 +958,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (umutual1 and self) amoeba_gpu_update_fieldp(&fieldp_pinned); - + int inum = atom->nlocal; double *field_ptr = (double *)fieldp_pinned; @@ -1015,8 +1015,8 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; } - - for (int i = 0; i < nlocal; i++) { + + for (int i = 0; i < nlocal; i++) { fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; @@ -1037,7 +1037,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) time1 = MPI_Wtime(); time_grid_uind += (time1 - time0); - + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition @@ -1137,7 +1137,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_sum_phi_pinned = nullptr; amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned); - + int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { @@ -1356,7 +1356,7 @@ void PairAmoebaGPU::polar_kspace() bspline_fill(); // allocate memory and make early host-device transfers - + // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill if (gpu_fphi_mpole_ready) { amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, @@ -1365,7 +1365,7 @@ void PairAmoebaGPU::polar_kspace() p_kspace->nylo_out, p_kspace->nyhi_out, p_kspace->nxlo_out, p_kspace->nxhi_out); } - + // convert Cartesian multipoles to fractional coordinates @@ -1435,7 +1435,7 @@ void PairAmoebaGPU::polar_kspace() double ***gridpost = (double ***) p_kspace->post_convolution(); // get potential - + if (!gpu_fphi_mpole_ready) { fphi_mpole(gridpost,fphi); @@ -1447,7 +1447,7 @@ void PairAmoebaGPU::polar_kspace() } else { void* fphi_pinned = nullptr; amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); - + double *_fphi_ptr = (double *)fphi_pinned; for (int i = 0; i < nlocal; i++) { int idx = i; @@ -1457,7 +1457,7 @@ void PairAmoebaGPU::polar_kspace() } } - } + } // convert field from fractional to Cartesian diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 420874df21..b7230594c5 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -71,16 +71,3 @@ class PairAmoebaGPU : public PairAmoeba { } // namespace LAMMPS_NS #endif #endif - -/* ERROR/WARNING messages: - -E: Insufficient memory on accelerator - -There is insufficient memory on one of the devices specified for the gpu -package - -E: Pair style amoeba/gpu requires atom attribute q - -The atom style defined does not have this attribute. - -*/ diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 61c30c0ad1..bf3e113ea7 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -2,7 +2,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -517,7 +517,7 @@ void PairHippoGPU::induce() } for (i = 0; i < nlocal; i++) { - itype = amtype[i]; + itype = amtype[i]; for (j = 0; j < 3; j++) { uopt[i][m][j] = polarity[itype] * field[i][j]; uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; @@ -785,7 +785,7 @@ void PairHippoGPU::induce() if (iter >= maxiter || eps > epsold) if (comm->me == 0) - error->warning(FLERR,"HIPPO induced dipoles did not converge"); + error->warning(FLERR,"HIPPO induced dipoles did not converge"); } // update the lists of previous induced dipole values @@ -1045,7 +1045,7 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) memset(&field[0][0], 0, 3*nall *sizeof(double)); memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); - + // get the real space portion of the mutual field first MPI_Barrier(world); @@ -1078,7 +1078,7 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (umutual1 and self) hippo_gpu_update_fieldp(&fieldp_pinned); - + int inum = atom->nlocal; double *field_ptr = (double *)fieldp_pinned; @@ -1136,8 +1136,8 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; } - - for (int i = 0; i < nlocal; i++) { + + for (int i = 0; i < nlocal; i++) { fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; @@ -1266,7 +1266,7 @@ void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_sum_phi_pinned = nullptr; hippo_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned); - + int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index b1b908411d..44bebd29f3 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -71,16 +71,3 @@ class PairHippoGPU : public PairAmoeba { } // namespace LAMMPS_NS #endif #endif - -/* ERROR/WARNING messages: - -E: Insufficient memory on accelerator - -There is insufficient memory on one of the devices specified for the gpu -package - -E: Pair style hippo/gpu requires atom attribute q - -The atom style defined does not have this attribute. - -*/ From 88e1ce33799ba875f8f94506c2ecd8e5fbf64ddd Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 15 Jan 2023 17:42:16 -0500 Subject: [PATCH 138/181] flag GPU acceleration --- doc/src/Commands_pair.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 59501b4a56..d9bbe590ef 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -39,7 +39,7 @@ OPT. * :doc:`agni (o) ` * :doc:`airebo (io) ` * :doc:`airebo/morse (io) ` - * :doc:`amoeba ` + * :doc:`amoeba (g) ` * :doc:`atm ` * :doc:`awpmd/cut ` * :doc:`beck (go) ` @@ -126,7 +126,7 @@ OPT. * :doc:`hbond/dreiding/lj (o) ` * :doc:`hbond/dreiding/morse (o) ` * :doc:`hdnnp ` - * :doc:`hippo ` + * :doc:`hippo (g) ` * :doc:`ilp/graphene/hbn (t) ` * :doc:`ilp/tmd (t) ` * :doc:`kolmogorov/crespi/full ` From 6ce7ea2f4bef772348b6dc7a811834afa610bb7b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 15 Jan 2023 17:43:15 -0500 Subject: [PATCH 139/181] remove obsolete commands --- examples/amoeba/in.ubiquitin | 7 ------- 1 file changed, 7 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index cb789a19f8..4c47edfcfc 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -34,13 +34,6 @@ pair_coeff * * amoeba_ubiquitin.prm amoeba_ubiquitin.key special_bonds lj/coul 0.5 0.5 0.5 one/five yes -fix fhal all store/state 0 fx fy fz -fix frepulse all store/state 0 fx fy fz -fix fdisp all store/state 0 fx fy fz -fix fpolar all store/state 0 fx fy fz -fix fmpole all store/state 0 fx fy fz -fix fqxfer all store/state 0 fx fy fz - # thermo output compute virial all pressure NULL virial From 62c010a7dee66fe8552ce547b2a67d65f7c7312c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 15 Jan 2023 18:11:33 -0500 Subject: [PATCH 140/181] add note to insert LAMMPS version when GPU acceleration was added --- doc/src/pair_amoeba.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index 94c956a585..113ae560f7 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -134,6 +134,10 @@ version discussed in :ref:`(Ponder) `, :ref:`(Ren) implementation of HIPPO in LAMMPS matches the version discussed in :ref:`(Rackers) `. +.. versionadded:: TBD + +Accelerator support via the GPU package is available. 
+ ---------- Only a single pair_coeff command is used with either the *amoeba* and From 9dc0369cee0649047ccf16cc0e9a3ab941b5ed07 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 15 Jan 2023 23:28:48 -0600 Subject: [PATCH 141/181] Attempted to resolve the address space change issue when casting for OpenCL 2.0 (ref: https://www.intel.com/content/www/us/en/developer/articles/technical/the-generic-address-space-in-opencl-20.html#06_address_space_casting) --- lib/gpu/lal_device.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index cdac6dfc97..1dbe1a0c40 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args) } _ocl_compile_string="-cl-mad-enable "; + #ifdef CL_VERSION_2_0 + _ocl_compile_string+="-cl-std=CL2.0 "; + #endif if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math "; _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+ std::string(OCL_PRECISION_COMPILE); From 973b46a90709694f879fb6515cea010b63f499c3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 16 Jan 2023 10:12:42 -0600 Subject: [PATCH 142/181] Attempted to resolve the memory access runtime errors when acquiring single and mixed precision arrays from the GPU lib --- lib/gpu/lal_amoeba.cu | 10 +- lib/gpu/lal_base_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 199 +++++++++++++++++++++++++----------- 3 files changed, 146 insertions(+), 65 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index cc593e4263..b3bbabadc3 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1631,9 +1631,9 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp2 *restrict grid, - __global numtyp *restrict fdip_phi1, - __global numtyp *restrict fdip_phi2, - __global numtyp *restrict fdip_sum_phi, + __global acctyp *restrict fdip_phi1, + __global acctyp *restrict fdip_phi2, + __global acctyp *restrict fdip_sum_phi, const int bsorder, const int inum, const int nzlo_out, const int nylo_out, const int nxlo_out, const int ngridxy, @@ -1843,7 +1843,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, } int idx; - numtyp fdip_buf[20]; + acctyp fdip_buf[20]; fdip_buf[0] = (numtyp)0.0; fdip_buf[1] = tuv100_1; @@ -1917,7 +1917,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp2 *restrict grid, - __global numtyp *restrict fphi, + __global acctyp *restrict fphi, const int bsorder, const int inum, const numtyp felec, const int nzlo_out, const int nylo_out, const int nxlo_out, const int ngridxy, diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a20c3886d5..a7f98fa5be 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -250,7 +250,7 @@ class BaseAmoeba { UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; UCL_Vector _cgrid_brick; - UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 713015b5c5..d3d4103953 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ 
b/src/GPU/pair_amoeba_gpu.cpp @@ -203,10 +203,7 @@ void PairAmoebaGPU::init_style() if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - if (tq_size == sizeof(double)) - tq_single = false; - else - tq_single = true; + tq_single = tq_size != sizeof(double); // replace with the gpu counterpart @@ -739,23 +736,44 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; + if (tq_single) { + auto field_ptr = (float *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + auto fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + auto fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } } + } @@ -960,23 +978,44 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) amoeba_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; + if (tq_single) { + auto field_ptr = (float *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + auto fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + auto fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } } + // accumulate timing information @@ -1139,32 +1178,63 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, 
&fdip_sum_phi_pinned); int nlocal = atom->nlocal; - double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi1[i][m] = _fdip_phi1_ptr[n]; - n += nlocal; + if (tq_single) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } } - } - double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi2[i][m] = _fdip_phi2_ptr[n]; - n += nlocal; + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } } - } - double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 20; m++) { - fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; - n += nlocal; + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } } } + } /* ---------------------------------------------------------------------- @@ -1447,15 +1517,26 @@ void PairAmoebaGPU::polar_kspace() } else { void* fphi_pinned = nullptr; amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); - - double *_fphi_ptr = (double *)fphi_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = i; - for (int m = 0; m < 20; m++) { - fphi[i][m] = _fphi_ptr[idx]; - idx += nlocal; + if (tq_single) { + auto _fphi_ptr = (float *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } else { + auto _fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } } } + } From b3e45c29cafe2fdf742cbd28d2423bd440a0e184 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 16 Jan 2023 10:30:03 -0600 Subject: [PATCH 143/181] Removed whitespaces --- src/GPU/pair_amoeba_gpu.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index d3d4103953..6a2f87ba2e 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -773,7 +773,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) fieldp[i][2] += fieldp_ptr[idx+2]; } } - + } @@ -1015,7 +1015,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) fieldp[i][2] += fieldp_ptr[idx+2]; } } - + // accumulate timing information @@ -1234,7 +1234,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, } } } - + } /* 
---------------------------------------------------------------------- @@ -1525,7 +1525,7 @@ void PairAmoebaGPU::polar_kspace() fphi[i][m] = _fphi_ptr[idx]; idx += nlocal; } - } + } } else { auto _fphi_ptr = (double *)fphi_pinned; for (int i = 0; i < nlocal; i++) { @@ -1536,8 +1536,6 @@ void PairAmoebaGPU::polar_kspace() } } } - - } // convert field from fractional to Cartesian From 9ab7f792e120868c23aca85ba31897d18fffaedc Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 16 Jan 2023 22:29:04 -0600 Subject: [PATCH 144/181] Fixed nullptr bug in the mutual fft timer --- src/AMOEBA/pair_amoeba.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index df9472e188..677bc48344 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -567,7 +567,8 @@ void PairAmoeba::finish() MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_fphi_uind = ave/comm->nprocs; - double time_mutual_fft = ic_kspace->time_fft; + double time_mutual_fft = 0; + if (ic_kspace) time_mutual_fft = ic_kspace->time_fft; MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mutual_fft = ave/comm->nprocs; From 28fbc2631b15888de1a70a35bf0689a817c796ef Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 16 Jan 2023 22:33:21 -0600 Subject: [PATCH 145/181] Fixed another bug with ic_kspace being nullptr --- src/GPU/pair_amoeba_gpu.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 6a2f87ba2e..8db2a901da 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -364,11 +364,12 @@ void PairAmoebaGPU::induce() // must be done before the first ufield0c // NOTE: this is for ic_kspace, and thetai[1-3] - amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, - thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); + if (ic_kspace) + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating From b59ee8d16c190efd87a1025efc05117a4405f4e8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 17 Jan 2023 03:31:59 -0500 Subject: [PATCH 146/181] silence compiler warnings --- src/GPU/pair_amoeba_gpu.cpp | 59 ++++++++++-------------------- src/GPU/pair_hippo_gpu.cpp | 72 ++++++++++++------------------------- 2 files changed, 41 insertions(+), 90 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 8db2a901da..5bc2b3a48c 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -234,7 +234,7 @@ void PairAmoebaGPU::multipole_real() int inum, host_start; bool success = true; - int *ilist, *numneigh, **firstneigh; + int *ilist, *numneigh; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -249,15 +249,15 @@ void PairAmoebaGPU::multipole_real() } inum = atom->nlocal; - firstneigh = amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - nullptr, nullptr, nullptr, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, 
&numneigh, cpu_time, - success, atom->q, domain->boxlo, domain->prd); + amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -303,7 +303,7 @@ void PairAmoebaGPU::multipole_real() void PairAmoebaGPU::induce() { bool done; - int i,j,m,ii,itype; + int i,j,m,itype; int iter,maxiter; double polmin; double eps,epsold; @@ -313,9 +313,6 @@ void PairAmoebaGPU::induce() double sum,sump,term; double reduce[4],allreduce[4]; - int debug = 1; - - // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -702,11 +699,9 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - + int inum; double sublo[3],subhi[3]; + if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; @@ -786,19 +781,19 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) void PairAmoebaGPU::udirect2b_cpu() { - int i,j,k,m,n,ii,jj,kk,kkk,jextra,ndip,itype,jtype,igroup,jgroup; + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; double xr,yr,zr,r,r2; double rr1,rr2,rr3,rr5; double bfac,exp2a; double ralpha,aefac; double aesq2,aesq2n; - double pdi,pti,ddi; + double pdi,pti; double pgamma; double damp,expdamp; double scale3,scale5; - double scale7,scalek; + double scalek; double bn[4],bcn[3]; - double factor_dscale,factor_pscale,factor_uscale,factor_wscale; + double factor_uscale; int inum,jnum; int *ilist,*jlist,*numneigh,**firstneigh; @@ -839,7 +834,6 @@ void PairAmoebaGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; - ddi = dirdamp[itype]; // evaluate all sites within the cutoff distance @@ -856,15 +850,8 @@ void PairAmoebaGPU::udirect2b_cpu() jtype = amtype[j]; jgroup = amgroup[j]; - factor_wscale = special_polar_wscale[sbmask15(jextra)]; - if (igroup == jgroup) { - factor_pscale = special_polar_piscale[sbmask15(jextra)]; - factor_dscale = polar_dscale; - factor_uscale = polar_uscale; - } else { - factor_pscale = special_polar_pscale[sbmask15(jextra)]; - factor_dscale = factor_uscale = 1.0; - } + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; r = sqrt(r2); rr1 = 1.0 / r; @@ -1251,10 +1238,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum; - double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1266,7 +1249,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff (off2) for the term @@ -1291,8 +1273,6 @@ void PairAmoebaGPU::polar_real() int eflag=1, vflag=1; double **f = atom->f; - int nall = atom->nlocal + atom->nghost; - int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -1305,7 +1285,6 @@ void PairAmoebaGPU::polar_real() } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff and aewald for the term diff --git a/src/GPU/pair_hippo_gpu.cpp 
b/src/GPU/pair_hippo_gpu.cpp index bf3e113ea7..1f0f3e820a 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -255,7 +255,7 @@ void PairHippoGPU::repulsion() int inum, host_start; bool success = true; - int *ilist, *numneigh, **firstneigh; + int *ilist, *numneigh; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -270,15 +270,15 @@ void PairHippoGPU::repulsion() } inum = atom->nlocal; - firstneigh = hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - nullptr, nullptr, nullptr, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, domain->prd); + hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); // select the correct cutoff for the term @@ -321,13 +321,8 @@ void PairHippoGPU::dispersion_real() return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - int *ilist, *numneigh, **firstneigh; - double sublo[3],subhi[3]; + if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; @@ -338,7 +333,6 @@ void PairHippoGPU::dispersion_real() } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff for the term @@ -366,7 +360,7 @@ void PairHippoGPU::multipole_real() int inum, host_start; bool success = true; - int *ilist, *numneigh, **firstneigh; + int *ilist, *numneigh; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -425,7 +419,7 @@ void PairHippoGPU::multipole_real() void PairHippoGPU::induce() { bool done; - int i,j,m,ii,itype; + int i,j,m,itype; int iter,maxiter; double polmin; double eps,epsold; @@ -435,8 +429,6 @@ void PairHippoGPU::induce() double sum,sump,term; double reduce[4],allreduce[4]; - int debug = 1; - // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -823,11 +815,9 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - + int inum; double sublo[3],subhi[3]; + if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; @@ -887,19 +877,18 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) void PairHippoGPU::udirect2b_cpu() { - int i,j,k,m,n,ii,jj,kk,kkk,jextra,ndip,itype,jtype,igroup,jgroup; + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; double xr,yr,zr,r,r2; double rr1,rr2,rr3,rr5; double bfac,exp2a; double ralpha,aefac; double aesq2,aesq2n; - double pdi,pti,ddi; + double pdi,pti; double pgamma; double damp,expdamp; - double scale3,scale5; - double scale7,scalek; + double scale3,scale5,scalek; double bn[4],bcn[3]; - double factor_dscale,factor_pscale,factor_uscale,factor_wscale; + double factor_uscale; int inum,jnum; int *ilist,*jlist,*numneigh,**firstneigh; @@ -940,7 +929,6 @@ void PairHippoGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; - ddi = dirdamp[itype]; // evaluate all sites within the cutoff distance @@ -957,15 +945,8 @@ void PairHippoGPU::udirect2b_cpu() jtype = amtype[j]; 
jgroup = amgroup[j]; - factor_wscale = special_polar_wscale[sbmask15(jextra)]; - if (igroup == jgroup) { - factor_pscale = special_polar_piscale[sbmask15(jextra)]; - factor_dscale = polar_dscale; - factor_uscale = polar_uscale; - } else { - factor_pscale = special_polar_pscale[sbmask15(jextra)]; - factor_dscale = factor_uscale = 1.0; - } + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; r = sqrt(r2); rr1 = 1.0 / r; @@ -1033,7 +1014,6 @@ void PairHippoGPU::udirect2b_cpu() void PairHippoGPU::ufield0c(double **field, double **fieldp) { - int i,j; double term; double time0,time1,time2; @@ -1309,10 +1289,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum; - double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1324,7 +1300,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff (off2) for the term @@ -1350,10 +1325,8 @@ void PairHippoGPU::polar_real() int eflag=1, vflag=1; double **f = atom->f; - int nall = atom->nlocal + atom->nghost; - int inum; - double sublo[3],subhi[3]; + if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; @@ -1364,7 +1337,6 @@ void PairHippoGPU::polar_real() } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff and aewald for the term From 71931d1d44ed52438d1712ed93fa6a16cfcb94b0 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 17 Jan 2023 09:39:03 -0600 Subject: [PATCH 147/181] Cleaned up, and added missing zero timers for extra fields transfers --- lib/gpu/lal_atom.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 142d64ef1d..cfd4368948 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -108,7 +108,7 @@ class Atom { bool velocity() { return _vel; } /// Returns true if GPU is using extra fields - bool using_extra() { return _extra_fields; } + bool using_extra() { return (_extra_fields>0); } /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -131,6 +131,8 @@ class Atom { time_quat.add_to_total(); if (_vel) time_vel.add_to_total(); + if (_extra_fields>0) + time_extra.add_to_total(); } /// Add copy times to timers @@ -142,6 +144,8 @@ class Atom { time_quat.zero(); if (_vel) time_vel.zero(); + if (_extra_fields>0) + time_extra.zero(); } /// Return the total time for host/device data transfer @@ -161,6 +165,10 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } + if (_extra_fields>0) { + total+=time_extra.total_seconds(); + time_extra.zero_total(); + } return total+_time_transfer/1000.0; } From f86375c992bf47f659cf2944b49fda3a9689f464 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 17 Jan 2023 09:47:09 -0600 Subject: [PATCH 148/181] Attempted to ensure that extra gets allocated in the exactly same way as other added fields (charge, quat and vel) --- lib/gpu/lal_atom.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index f195bf5287..03f3b477c9 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -124,7 +124,7 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } - if (_extra_fields>0 && 
_host_view==false) { + if (_extra_fields>0 && !_host_view) { success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=extra.device.row_bytes(); From eddd3d6f254c553086863c68c1f638731cc699be Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 18 Jan 2023 20:04:45 -0600 Subject: [PATCH 149/181] Fixed a bug with extra being nullptr when _host_view is true: always allocate extra (Note that BaseAmoeba has its own cast_extra_data() that doesn't know if extra is allocated properly, it is the case when _host_view is false for dedicated GPUs for example) --- lib/gpu/lal_atom.cpp | 2 +- lib/gpu/lal_atom.h | 19 +++++++------------ lib/gpu/lal_device.cpp | 2 +- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 03f3b477c9..bf27334578 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -124,7 +124,7 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } - if (_extra_fields>0 && !_host_view) { + if (_extra_fields>0) { success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=extra.device.row_bytes(); diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index cfd4368948..f4b23822f8 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -470,18 +470,13 @@ class Atom { inline void cast_extra_data(cpytyp *host_ptr) { if (_extra_avail==false) { double t=MPI_Wtime(); - if (_host_view) { - extra.host.view((numtyp*)host_ptr,_nall*_extra_fields,*dev); - extra.device.view(extra.host); - } else if (sizeof(numtyp)==sizeof(double)) - memcpy(extra.host.begin(),host_ptr,_nall*_extra_fields*sizeof(numtyp)); - else - #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) - #pragma omp parallel for simd schedule(static) - #elif (LAL_USE_OMP_SIMD == 1) - #pragma omp simd - #endif - for (int i=0; i<_nall*_extra_fields; i++) extra[i]=host_ptr[i]; + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall*_extra_fields; i++) + extra[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 1dbe1a0c40..e54d16266c 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -490,7 +490,7 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (!atom.velocity() && vel) _data_in_estimate++; - if (atom.using_extra()==false && extra_fields>0) + if (atom.using_extra() && extra_fields>0) _data_in_estimate++; if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields)) return -3; From 3ae2805316f08c2de6e3612401632a4929cbdf4d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 19 Jan 2023 07:06:29 -0500 Subject: [PATCH 150/181] add option variable to CMake build to select GPU library debug --- cmake/Modules/Packages/GPU.cmake | 27 +++++++++++++++++++++------ doc/src/Build_extras.rst | 7 ++++--- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 92fe608656..89e15e548b 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -28,6 +28,9 @@ elseif(GPU_PREC STREQUAL "SINGLE") set(GPU_PREC_SETTING "SINGLE_SINGLE") endif() +option(GPU_DEBUG "Enable debugging code of the GPU package" OFF) +mark_as_advanced(GPU_DEBUG) + file(GLOB GPU_LIB_SOURCES 
${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu) @@ -153,7 +156,12 @@ if(GPU_API STREQUAL "CUDA") add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS}) target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS}) - target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS}) + target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() if(CUDPP_OPT) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini) target_compile_definitions(gpu PRIVATE -DUSE_CUDPP) @@ -227,9 +235,12 @@ elseif(GPU_API STREQUAL "OPENCL") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) - + target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) + endif() target_link_libraries(lammps PRIVATE gpu) add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) @@ -379,8 +390,12 @@ elseif(GPU_API STREQUAL "HIP") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_HIP) + target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() target_link_libraries(gpu PRIVATE hip::host) if(HIP_USE_DEVICE_SORT) diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index d7bbe65a18..3539b14b41 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -127,10 +127,11 @@ CMake build -D GPU_API=value # value = opencl (default) or cuda or hip -D GPU_PREC=value # precision setting # value = double or mixed (default) or single - -D HIP_PATH # path to HIP installation. Must be set if GPU_API=HIP -D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda - # value = sm_XX, see below - # default is sm_50 + # value = sm_XX (see below, default is sm_50) + -D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers + # value = yes or no (default) + -D HIP_PATH=value # value = path to HIP installation. 
Must be set if GPU_API=HIP -D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip # value depends on selected HIP_PLATFORM # default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc From 4244d2e6cdcd16f0837edc1ef60be405c3b993e9 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 19 Jan 2023 08:56:54 -0500 Subject: [PATCH 151/181] silence compiler warnings about unused parameters and variables --- lib/gpu/geryon/hip_macros.h | 3 ++ lib/gpu/geryon/nvd_macros.h | 3 ++ lib/gpu/geryon/ocl_device.h | 5 +-- lib/gpu/geryon/ocl_macros.h | 3 ++ lib/gpu/geryon/ocl_memory.h | 6 +-- lib/gpu/geryon/ocl_texture.h | 10 ++--- lib/gpu/geryon/ocl_timer.h | 2 +- lib/gpu/geryon/ucl_copy.h | 37 +++++++--------- lib/gpu/geryon/ucl_d_vec.h | 6 +-- lib/gpu/geryon/ucl_h_vec.h | 6 +-- lib/gpu/geryon/ucl_s_obj_help.h | 22 ++++------ lib/gpu/lal_amoeba.cpp | 10 ++--- lib/gpu/lal_amoeba_ext.cpp | 4 +- lib/gpu/lal_atom.cpp | 7 ++- lib/gpu/lal_atom.h | 4 +- lib/gpu/lal_base_amoeba.cpp | 63 +++++++++++++-------------- lib/gpu/lal_base_amoeba.h | 2 +- lib/gpu/lal_base_dpd.cpp | 4 +- lib/gpu/lal_charmm_long.cpp | 22 ++++------ lib/gpu/lal_device.cpp | 4 +- lib/gpu/lal_dpd_tstat_ext.cpp | 8 ++-- lib/gpu/lal_eam.cpp | 6 +-- lib/gpu/lal_hippo.cpp | 77 ++++++++++++++++----------------- lib/gpu/lal_hippo_ext.cpp | 2 +- lib/gpu/lal_neighbor.h | 6 ++- lib/gpu/lal_sw.cpp | 2 +- lib/gpu/lal_vashishta.cpp | 4 +- 27 files changed, 164 insertions(+), 164 deletions(-) diff --git a/lib/gpu/geryon/hip_macros.h b/lib/gpu/geryon/hip_macros.h index 96313ec87e..e16caf4944 100644 --- a/lib/gpu/geryon/hip_macros.h +++ b/lib/gpu/geryon/hip_macros.h @@ -26,6 +26,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h index ac2e6cc682..19c8ff4b6c 100644 --- a/lib/gpu/geryon/nvd_macros.h +++ b/lib/gpu/geryon/nvd_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 4163d40881..588c53c8fa 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -309,15 +309,14 @@ class UCL_Device { /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes - inline size_t max_pitch(const int i) { return 0; } + inline size_t max_pitch(const int) { return 0; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ - inline bool sharing_supported(const int i) - { return true; } + inline bool sharing_supported(const int) { return true; } /// True if the device is a sub-device inline bool is_subdevice() diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index 5e5a190ede..652d7795e9 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK 
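
The UCL_DEBUG_ARG() macro added to hip_macros.h, nvd_macros.h and ocl_macros.h above expands to the parameter name only when UCL_DEBUG is defined, so a debug-only assert can still reference the argument while release builds see an unnamed parameter and emit no unused-parameter warning (this is how the view() methods in ucl_d_vec.h and ucl_h_vec.h use it further down). A minimal standalone sketch of that pattern; the view_cols() function is illustrative only and not part of the Geryon headers:

    #include <cassert>
    #include <cstddef>

    #ifdef UCL_DEBUG
    #define UCL_DEBUG_ARG(arg) arg
    #else
    #define UCL_DEBUG_ARG(arg)
    #endif

    // 'rows' is referenced only by the debug-build assert below; when
    // UCL_DEBUG is not defined the parameter is left unnamed, so the
    // compiler has nothing to warn about.
    inline std::size_t view_cols(const std::size_t UCL_DEBUG_ARG(rows),
                                 const std::size_t cols) {
    #ifdef UCL_DEBUG
      assert(rows == 1);
    #endif
      return cols;
    }
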
diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index bfc260889a..5d8b9808bd 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o, template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){ cl_mem_flags buffer_perm; cl_map_flags map_perm; if (kind==UCL_READ_ONLY) { @@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t /*dst_offset*/, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 1S\n"; @@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t dst_offset, const size_t /*src_offset*/) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); #ifdef UCL_DBG_MEM_TRACE diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 8ddde5b2a3..87db3794a6 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -35,19 +35,19 @@ class UCL_Texture { UCL_Texture() {} ~UCL_Texture() {} /// Construct with a specified texture reference - inline UCL_Texture(UCL_Program &prog, const char *texture_name) { } + inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Set the texture reference for this object - inline void get_texture(UCL_Program &prog, const char *texture_name) { } + inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Bind a float array where each fetch grabs a vector of length numel template - inline void bind_float(mat_typ &vec, const unsigned numel) { } + inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { } /// Unbind the texture reference from the memory allocation inline void unbind() { } /// Make a texture reference available to kernel - inline void allow(UCL_Kernel &kernel) { } + inline void allow(UCL_Kernel & /*kernel*/) { } private: friend class UCL_Kernel; @@ -62,7 +62,7 @@ class UCL_Const { inline UCL_Const(UCL_Program &prog, const char *global_name) { get_global(prog,global_name); } /// Set the global reference for this object - inline void get_global(UCL_Program &prog, const char *global_name) { + inline void get_global(UCL_Program &prog, const char * /*global_name*/) { if (_active) { CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 189871e631..8f55a91a28 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -71,7 +71,7 @@ class UCL_Timer { inline void init(UCL_Device &dev) { init(dev,dev.cq()); } /// Initialize command queue for timing - inline void init(UCL_Device &dev, command_queue &cq) { + inline void init(UCL_Device & /*dev*/, command_queue &cq) { clear(); _cq=cq; clRetainCommandQueue(_cq); diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c906a14f30..94b57f7a09 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> { // 
Should never be here template struct _host_host_copy { template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) { assert(0==1); } template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) { assert(0==1); } }; @@ -470,24 +469,22 @@ template struct _ucl_cast_copy { // Neither on host or both on host template <> struct _ucl_cast_copy<1,1> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; @@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> { // Neither on host or both on host template <> struct _ucl_cast_copy<0,0> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 9158e145b3..5e281fef07 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -230,8 +230,8 @@ class UCL_D_Vec : 
public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols) { + inline void view_offset(const size_t offset,ucl_type &input, + const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 2f49f9f633..9f734ac40c 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template - inline void view(ptr_type *input, const size_t rows, const size_t cols, + inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols, UCL_Device &dev) { #ifdef UCL_DEBUG assert(rows==1); @@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + inline void view_offset(const size_t offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index a10f3cdb3f..9bc2c40fe2 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ -27,7 +27,7 @@ template struct _ucl_s_obj_help; // -- Can potentially use same memory if shared by accelerator template <> struct _ucl_s_obj_help<1> { template - static inline int alloc(t1 &host, t2 &device, t3 &_buffer, + static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/, const int cols, UCL_Device &acc, const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { @@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,async); } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cq); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - const bool async) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cols,cq); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, - t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,rows,cols,async); } template - static inline void copy(t1 
&dst, t2 &src, const int rows, const int cols, - t3 &buffer, command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,rows,cols,cq); } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); return UCL_SUCCESS; @@ -353,7 +349,7 @@ template struct _ucl_s_obj_help { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) { int err=buff.resize(cols); if (err!=UCL_SUCCESS) return err; diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 38aa2bde27..5e19997913 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -48,10 +48,10 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, const double *host_special_hal, - const double *host_special_repel, - const double *host_special_disp, + const double * /*host_special_repel*/, + const double * /*host_special_disp*/, const double *host_special_mpole, - const double *host_special_polar_wscale, + const double * /*host_special_polar_wscale*/, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, @@ -188,7 +188,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { // Launch the real-space permanent field kernel // --------------------------------------------------------------------------- template -int AmoebaT::udirect2b(const int eflag, const int vflag) { +int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) { int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -230,7 +230,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // Launch the real-space induced field kernel, returning field and fieldp // --------------------------------------------------------------------------- template -int AmoebaT::umutual2b(const int eflag, const int vflag) { +int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) { int ainum=this->ans->inum(); if (ainum == 0) return 0; diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 5e4d48a2da..fe3d4a26d8 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -119,8 +119,8 @@ void amoeba_gpu_clear() { int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, + int *host_amgroup, double **host_rpole, double ** /*host_uind*/, + double ** /*host_uinp*/, double * /*host_pval*/, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index bf27334578..853fdf216d 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -403,9 +403,14 @@ double AtomT::host_memory_usage() const { return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } +#ifdef USE_CUDPP +#define USE_CUDPP_ARG(arg) arg +#else +#define USE_CUDPP_ARG(arg) +#endif // Sort arrays for neighbor list calculation template -void AtomT::sort_neighbor(const int num_atoms) { 
+void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) { #ifdef USE_CUDPP CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), (int *)dev_particle_id.begin(), diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index f4b23822f8..4b29d76cb1 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -327,7 +327,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -441,7 +441,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double **host_ptr, tagint *host_tag) { + inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a1d4a00c2c..99e3a6a77e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -288,7 +288,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd) { + bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) { acc_timers(); if (eatom) _eflag=2; else if (eflag_in) _eflag=1; @@ -368,20 +368,21 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // this is the first part in a time step done on the GPU for AMOEBA for now // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_mpole, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { +void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double */*host_pval*/, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool /*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double aewald, const double felec, + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { // ------------------- Resize _tep array ------------------------ if (inum_full>_max_tep_size) { @@ -393,7 +394,7 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, _off2_mpole = off2_mpole; _felec = felec; _aewald = aewald; - const int red_blocks=multipole_real(_eflag,_vflag); + 
multipole_real(_eflag,_vflag); // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (polar_real) @@ -424,7 +425,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double // specify the correct cutoff and alpha values _off2_polar = off2_polar; _aewald = aewald; - const int red_blocks=udirect2b(_eflag,_vflag); + udirect2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -436,10 +437,10 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double // of the induced field // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - const double aewald, const double off2_polar, - void** fieldp_ptr) { +void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, + void** /*fieldp_ptr*/) { // only copy the necessary data arrays that are updated over the iterations // use nullptr for the other arrays that are already copied from host to device cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); @@ -449,7 +450,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double _off2_polar = off2_polar; _aewald = aewald; // launch the kernel - const int red_blocks=umutual2b(_eflag,_vflag); + umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer @@ -492,7 +493,7 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); } else { - if (_thetai1.cols()<_max_thetai_size*bsorder) { + if ((int)_thetai1.cols()<_max_thetai_size*bsorder) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder); _thetai2.resize(_max_thetai_size*bsorder); @@ -573,7 +574,7 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, int numel = _num_grid_points; if (_cgrid_brick.cols() == 0) { _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); - } else if (numel > _cgrid_brick.cols()) { + } else if (numel > (int)_cgrid_brick.cols()) { _cgrid_brick.resize(numel); } } @@ -611,7 +612,7 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, #endif // launch the kernel with its execution configuration (see below) - const int red_blocks = fphi_uind(); + fphi_uind(); // copy data from device to host asynchronously _fdip_phi1.update_host(_max_thetai_size*10, true); @@ -682,7 +683,7 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi _cgrid_brick.update_device(_num_grid_points, false); _felec = felec; - const int red_blocks = fphi_mpole(); + fphi_mpole(); _fdip_sum_phi.update_host(_max_thetai_size*20); @@ -698,9 +699,6 @@ int BaseAmoebaT::fphi_mpole() { if (ainum == 0) return 0; - int _nall=atom->nall(); - int nbor_pitch=nbor->nbor_pitch(); - // Compute the block size and grid size to keep all cores busy const int BX=block_size(); @@ -771,7 +769,7 @@ double 
BaseAmoebaT::host_memory_usage_atomic() const { // --------------------------------------------------------------------------- template -void BaseAmoebaT::setup_fft(const int numel, const int element_type) +void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/) { // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) } @@ -781,7 +779,8 @@ void BaseAmoebaT::setup_fft(const int numel, const int element_type) // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) +void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/, + const int /*numel*/, const int /*mode*/) { // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) @@ -940,7 +939,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) if (dev.has_subgroup_support()) { - size_t mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); + int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); if (_threads_per_atom > mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz; device->set_simd_size(mx_subgroup_sz); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a7f98fa5be..0eaaafeb1e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -280,7 +280,7 @@ class BaseAmoeba { UCL_Kernel k_fphi_uind, k_fphi_mpole; UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } - inline void set_kernel(const int eflag, const int vflag) {} + inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {} // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index c084c02ff0..e103699d40 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -196,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall, const double cpu_time, bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, - const int nlocal, double *boxlo, double *prd) { + const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -261,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, const double cpu_time, bool &success, double **host_v, const double dtinvsqrt, const int seed, const int timestep, - double *boxlo, double *prd) { + double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 8008b1fbb3..0d01d70fb1 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -44,19 +44,15 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { } template -int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, 
const bool mix_arithmetic) { +int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double ** /*host_offset*/, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *_screen, + double host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, const double g_ewald, + const double cut_lj_innersq, const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,charmm_long,"k_charmm_long"); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index e54d16266c..dd3ce15827 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -52,7 +52,7 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, +int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu, const int first_gpu_id, const int gpu_mode, const double p_split, const int t_per_atom, const double user_cell_size, char *ocl_args, @@ -528,7 +528,7 @@ int DeviceT::init(Answer &ans, const int nlocal, template int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, - const int host_nlocal, const int nall, + const int host_nlocal, const int /*nall*/, const int maxspecial, const int gpu_host, const int max_nbors, const double cutoff, const bool pre_cut, const int threads_per_atom, diff --git a/lib/gpu/lal_dpd_tstat_ext.cpp b/lib/gpu/lal_dpd_tstat_ext.cpp index 2b63bf62e7..78a1bf2d9d 100644 --- a/lib/gpu/lal_dpd_tstat_ext.cpp +++ b/lib/gpu/lal_dpd_tstat_ext.cpp @@ -28,10 +28,10 @@ static DPD DPDTMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0, - double **host_gamma, double **host_sigma, double **host_cut, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen) { + double **host_gamma, double **host_sigma, double **host_cut, + double *special_lj, const int inum, + const int nall, const int /*max_nbors*/, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { DPDTMF.clear(); gpu_mode=DPDTMF.device->gpu_mode(); double gpu_split=DPDTMF.device->particle_split(); diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 2c0d63f7bf..b7bc7b958a 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, + const bool /*eatom*/, const bool /*vatom*/, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); @@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag_in, - const bool vflag_in, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, + const bool vflag_in, const bool /*eatom*/, + const bool /*vatom*/, int &host_start, int **ilist, int 
**jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index f8ab436ad0..24ffae8de2 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -176,19 +176,19 @@ double HippoT::host_memory_usage() const { // Compute the repulsion term, returning tep // --------------------------------------------------------------------------- template -void HippoT::compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, +void HippoT::compute_repulsion(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double /*aewald*/, const double off2_repulse, + double * /*host_q*/, double * /*boxlo*/, double * /*prd*/, double cut2, double c0, double c1, double c2, double c3, double c4, double c5, void **tep_ptr) { this->acc_timers(); @@ -223,7 +223,7 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, _c3 = c3; _c4 = c4; _c5 = c5; - const int red_blocks=repulsion(this->_eflag,this->_vflag); + repulsion(this->_eflag,this->_vflag); // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); @@ -287,7 +287,7 @@ void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, this->_off2_disp = off2_disp; this->_aewald = aewald; - const int red_blocks=dispersion_real(this->_eflag,this->_vflag); + dispersion_real(this->_eflag,this->_vflag); // only copy them back if this is the last kernel // otherwise, commenting out these two lines to leave the answers @@ -341,21 +341,21 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // Compute the multipole real-space term, returning tep // --------------------------------------------------------------------------- template -void HippoT::compute_multipole_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double* host_pval, double *sublo, - double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, +void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double* host_pval, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool 
/*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, const double aewald, const double felec, - const double off2_mpole, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { // cast necessary data arrays from host to device @@ -373,7 +373,7 @@ void HippoT::compute_multipole_real(const int ago, const int inum_full, this->_off2_mpole = off2_mpole; this->_felec = felec; this->_aewald = aewald; - const int red_blocks=multipole_real(this->_eflag,this->_vflag); + multipole_real(this->_eflag,this->_vflag); // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); @@ -424,7 +424,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { // returning field and fieldp // --------------------------------------------------------------------------- template -void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, +void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, double **host_uind, double **host_uinp, double* host_pval, const double aewald, const double off2_polar, void** fieldp_ptr) { @@ -438,7 +438,7 @@ void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **hos this->_off2_polar = off2_polar; this->_aewald = aewald; - const int red_blocks=udirect2b(this->_eflag,this->_vflag); + udirect2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -449,7 +449,7 @@ void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **hos // Launch the real-space permanent field kernel // --------------------------------------------------------------------------- template -int HippoT::udirect2b(const int eflag, const int vflag) { +int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) { int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -493,10 +493,9 @@ int HippoT::udirect2b(const int eflag, const int vflag) { // returning field and fieldp // --------------------------------------------------------------------------- template -void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - const double aewald, const double off2_polar, - void** fieldp_ptr) { +void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) { // cast necessary data arrays from host to device @@ -505,7 +504,7 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos this->_off2_polar = off2_polar; this->_aewald = aewald; - const int red_blocks=umutual2b(this->_eflag,this->_vflag); + umutual2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer @@ -517,7 +516,7 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos // Launch the real-space induced field kernel // --------------------------------------------------------------------------- template -int HippoT::umutual2b(const int eflag, const int vflag) { +int 
HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) { int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -557,8 +556,8 @@ int HippoT::umutual2b(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, +void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, const double aewald, const double felec, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 77450bf7b1..b5ac42744a 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -123,7 +123,7 @@ void hippo_gpu_clear() { int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, + double ** /*host_uind*/, double ** /*host_uinp*/, double * /*host_pval*/, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 9061ce5150..24aaf6aeba 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -293,15 +293,17 @@ class Neighbor { #endif int _simd_size; + #ifdef LAL_USE_OLD_NEIGHBOR inline void set_nbor_block_size(const int mn) { - #ifdef LAL_USE_OLD_NEIGHBOR int desired=mn/(2*_simd_size); desired*=_simd_size; if (desired<_simd_size) desired=_simd_size; else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build; _block_nbor_build=desired; - #endif } + #else + inline void set_nbor_block_size(const int) {} + #endif }; } diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index eb42c710cc..9687a0352d 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -150,7 +150,7 @@ double SWT::host_memory_usage() const { // --------------------------------------------------------------------------- template int SWT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index c343de3f55..fcc9d00ab0 100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i const double* costheta, const double* bigb, const double* big2b, const double* bigc) { - int success; + int success=0; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", "k_vashishta_three_end","k_vashishta_short_nbor"); @@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const { // --------------------------------------------------------------------------- template int VashishtaT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list From 03ab42fd52fd104edcf8b973fb1ece54733cc9ac Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 19 Jan 2023 08:57:24 -0500 
Subject: [PATCH 152/181] correct calling sequence for matching argument types --- lib/gpu/lal_base_amoeba.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 99e3a6a77e..841d968e56 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -685,7 +685,7 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi _felec = felec; fphi_mpole(); - _fdip_sum_phi.update_host(_max_thetai_size*20); + _fdip_sum_phi.update_host(_max_thetai_size*20, false); *host_fphi = _fdip_sum_phi.host.begin(); } From 8eb722a32ad3d551a1c64e88fc768f66af771a35 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 19 Jan 2023 13:22:27 -0600 Subject: [PATCH 153/181] Enforced synchronous host-device transfers for cgrid_brick and fdip arrays --- lib/gpu/lal_base_amoeba.cpp | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 841d968e56..21d9975b28 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -591,10 +591,6 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi2, void **host_fdip_sum_phi) { - // TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correcly - UCL_H_Vec hdummy; - hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); - int n = 0; for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) @@ -605,7 +601,7 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, _cgrid_brick[n] = v; n++; } - _cgrid_brick.update_device(_num_grid_points, true); + _cgrid_brick.update_device(_num_grid_points, false); #ifdef ASYNC_DEVICE_COPY ucl_device->sync(); @@ -614,10 +610,10 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, // launch the kernel with its execution configuration (see below) fphi_uind(); - // copy data from device to host asynchronously - _fdip_phi1.update_host(_max_thetai_size*10, true); - _fdip_phi2.update_host(_max_thetai_size*10, true); - _fdip_sum_phi.update_host(_max_thetai_size*20, true); + // copy data from device to host + _fdip_phi1.update_host(_max_thetai_size*10, false); + _fdip_phi2.update_host(_max_thetai_size*10, false); + _fdip_sum_phi.update_host(_max_thetai_size*20, false); // return the pointers to the host-side arrays *host_fdip_phi1 = _fdip_phi1.host.begin(); @@ -638,13 +634,7 @@ int BaseAmoebaT::fphi_uind() { const int BX=block_size(); const int GX=static_cast(ceil(static_cast(ainum)/BX)); - /* - const int cus = this->device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/BX)); - } - */ + time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); @@ -666,10 +656,6 @@ int BaseAmoebaT::fphi_uind() { template void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { - // TODO: grid brick[k][j][i] is a scalar - UCL_H_Vec hdummy; - hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); - int n = 0; for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) From bdf8dd4e5415ff0dcd8bcf7a704d64726ff860df Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 00:32:31 -0500 Subject: [PATCH 154/181] serial version --- src/REAXFF/fix_reaxff_species.cpp | 53 +++++++++++++++++++++++++++++-- src/REAXFF/fix_reaxff_species.h | 1 + 2 files changed, 52 
insertions(+), 2 deletions(-)

diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp
index 65a2e6d8ce..ceaf983893 100644
--- a/src/REAXFF/fix_reaxff_species.cpp
+++ b/src/REAXFF/fix_reaxff_species.cpp
@@ -38,6 +38,7 @@
 #include
 #include
+#include
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
@@ -145,6 +146,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
   ele = filepos = filedel = nullptr;
   eleflag = posflag = padflag = 0;
   delflag = specieslistflag = masslimitflag = 0;
+  delete_Nlimit = delete_Nsteps = 0;
   singlepos_opened = multipos_opened = del_opened = 0;
   multipos = 0;
 
@@ -221,7 +223,12 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
       } else
         error->all(FLERR, "Unknown fix reaxff/species delete option: {}", arg[iarg]);
-
+      // rate limit when deleting molecules
+    } else if (strcmp(arg[iarg], "delete_rate_limit") == 0) {
+      if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species delete_rate_limit", error);
+      delete_Nlimit = utils::numeric(FLERR, arg[iarg+1], false, lmp);
+      delete_Nsteps = utils::numeric(FLERR, arg[iarg+2], false, lmp);
+      iarg += 3;
       // position of molecules
     } else if (strcmp(arg[iarg], "position") == 0) {
       if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species position", error);
@@ -260,6 +267,14 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
   if (delflag && specieslistflag && masslimitflag)
     error->all(FLERR, "Incompatible combination fix reaxff/species command options");
 
+  if (delete_Nlimit > 0) {
+    memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount");
+
+    for (int i = 0; i < delete_Nsteps; i++)
+      delete_Tcount[i] = -1;
+    delete_Tcount[0] = 0;
+  }
+
   vector_nmole = 0;
   vector_nspec = 0;
 }
@@ -279,6 +294,7 @@ FixReaxFFSpecies::~FixReaxFFSpecies()
   memory->destroy(Mol2Spec);
   memory->destroy(MolType);
   memory->destroy(MolName);
+  memory->destroy(delete_Tcount);
 
   delete[] filepos;
   delete[] filedel;
@@ -375,6 +391,11 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/)
   // point to fix_ave_atom
 
   f_SPECBOND->end_of_step();
 
+  // push back delete_Tcount on every step
+  if (delete_Nlimit > 0)
+    for (int i = delete_Nsteps-1; i > 0; i--)
+      delete_Tcount[i] = delete_Tcount[i-1];
+
   if (ntimestep != nvalid) return;
 
   nlocal = atom->nlocal;
@@ -826,6 +847,15 @@ void FixReaxFFSpecies::WritePos(int Nmole, int Nspec)
 
 void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
 {
+  int ndeletions;
+  int headroom = -1;
+  if (delete_Nlimit > 0) {
+    if (delete_Tcount[delete_Nsteps-1] == -1) return;
+    ndeletions = delete_Tcount[0] - delete_Tcount[delete_Nsteps-1];
+    headroom = MAX(0, delete_Nlimit - ndeletions);
+    if (headroom == 0) return;
+  }
+
   int i, j, m, n, itype, cid;
   int ndel, ndelone, count, count_tmp;
   int *Nameall;
@@ -856,7 +886,20 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
   int *marklist;
   memory->create(marklist, nlocal, "reaxff/species:marklist");
 
-  for (m = 1; m <= Nmole; m++) {
+  std::random_device rnd;
+  std::minstd_rand park_rng(rnd());
+  int *molrange;
+  memory->create(molrange,Nmole,"reaxff/species:molrange");
+  for (m = 0; m < Nmole; m++)
+    molrange[m] = m + 1;
+  // shuffle index when using rate_limit, in case order is biased
+  if (delete_Nlimit > 0)
+    std::shuffle(&molrange[0],&molrange[Nmole], park_rng);
+
+  int this_delete_Tcount = 0;
+  for (int mm = 0; mm < Nmole; mm++) {
+    if (this_delete_Tcount == headroom) break;
+    m = molrange[mm];
     localmass = totalmass = count = nmarklist
= 0; for (n = 0; n < ntypes; n++) Name[n] = 0; @@ -896,6 +939,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) // find corresponding moltype if (totalmass > massmin && totalmass < massmax) { + this_delete_Tcount++; for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[Mol2Spec[m - 1]] += 1.0 / (double) count; @@ -905,6 +949,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) if (count > 0) { for (i = 0; i < ndelspec; i++) { if (del_species[i] == species_str) { + this_delete_Tcount++; for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[i] += 1.0 / (double) count; @@ -976,6 +1021,9 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) } } + if (delete_Nlimit) + delete_Tcount[0] += this_delete_Tcount; + if (ndel && (atom->map_style != Atom::MAP_NONE)) { atom->nghost = 0; atom->map_init(); @@ -988,6 +1036,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) memory->destroy(marklist); memory->destroy(mark); memory->destroy(deletecount); + memory->destroy(molrange); } /* ---------------------------------------------------------------------- */ diff --git a/src/REAXFF/fix_reaxff_species.h b/src/REAXFF/fix_reaxff_species.h index 65eeae4c60..329e17145b 100644 --- a/src/REAXFF/fix_reaxff_species.h +++ b/src/REAXFF/fix_reaxff_species.h @@ -60,6 +60,7 @@ class FixReaxFFSpecies : public Fix { FILE *fp, *pos, *fdel; int eleflag, posflag, multipos, padflag, setupflag; int delflag, specieslistflag, masslimitflag; + int delete_Nlimit, delete_Nsteps, *delete_Tcount; double massmin, massmax; int singlepos_opened, multipos_opened, del_opened; char *ele, **eletype, *filepos, *filedel; From 096e0a14f009fcfa05d777daabb66e2bd7b89d8f Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 00:38:06 -0500 Subject: [PATCH 155/181] off-by-one fix --- src/REAXFF/fix_reaxff_species.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index ceaf983893..6ad7b50226 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -391,12 +391,13 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/) // point to fix_ave_atom f_SPECBOND->end_of_step(); - // push back delete_Tcount on every step - if (delete_Nlimit > 0) - for (int i = delete_Nsteps-1; i > 0; i--) - delete_Tcount[i] = delete_Tcount[i-1]; - - if (ntimestep != nvalid) return; + if (ntimestep != nvalid) { + // push back delete_Tcount on every step + if (delete_Nlimit > 0) + for (int i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; + return; + } nlocal = atom->nlocal; @@ -1021,8 +1022,13 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) } } - if (delete_Nlimit) + + // push back delete_Tcount on every step + if (delete_Nlimit > 0) { + for (i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; delete_Tcount[0] += this_delete_Tcount; + } if (ndel && (atom->map_style != Atom::MAP_NONE)) { atom->nghost = 0; From bebf79ec92251f04d36b40dfce0e7f627d74f50a Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 00:41:56 -0500 Subject: [PATCH 156/181] reaxff species delete_rate_limit keyword docs --- doc/src/fix_reaxff_species.rst | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/src/fix_reaxff_species.rst b/doc/src/fix_reaxff_species.rst index 11f0e7b7e7..dcb9cfa3bc 100644 --- a/doc/src/fix_reaxff_species.rst +++ 
b/doc/src/fix_reaxff_species.rst
@@ -39,6 +39,9 @@ Syntax
   *masslimit* value = massmin massmax
     massmin = minimum molecular weight of species to delete
     massmax = maximum molecular weight of species to delete
+  *delete_rate_limit* value = Nlimit Nsteps
+    Nlimit = maximum number of deletions allowed to occur within interval
+    Nsteps = the interval (number of timesteps) over which to count deletions
 
 Examples
 """"""""
@@ -140,7 +143,13 @@ When using the *masslimit* keyword, each line of the *filedel* file
 contains the timestep on which deletions occurs, followed by how many
 of each species are deleted (with quantities preceding chemical
 formulae). The *specieslist* and *masslimit* keywords cannot both be
-used in the same *reaxff/species* fix.
+used in the same *reaxff/species* fix. The *delete_rate_limit*
+keyword can enforce an upper limit on the overall rate of molecule
+deletion. The number of deletion occurrences is limited to Nlimit
+within an interval of Nsteps timesteps. When using the
+*delete_rate_limit* keyword, no deletions are permitted to occur
+within the first Nsteps timesteps of the first run (after reading
+either a data or restart file).
 
 ----------
 

From 617d70dd1ca6b466ef784dadcb513286e0171d89 Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Fri, 20 Jan 2023 14:19:16 -0600
Subject: [PATCH 157/181] Replaced MPI_Wtime() with platform::walltime(), put
 the low-level timing breakdown inside #if DEBUG_AMOEBA

---
 src/AMOEBA/amoeba_convolution.cpp  | 16 ++++++++--------
 src/AMOEBA/amoeba_induce.cpp       | 26 ++++++++++++--------------
 src/AMOEBA/amoeba_multipole.cpp    |  6 +++---
 src/AMOEBA/amoeba_polar.cpp        |  6 +++---
 src/AMOEBA/pair_amoeba.cpp         | 26 ++++++++++++++------------
 src/GPU/amoeba_convolution_gpu.cpp |  8 ++++----
 src/GPU/pair_amoeba_gpu.cpp        | 23 +++++++++++------------
 src/GPU/pair_hippo_gpu.cpp         | 22 ++++++++++------------
 8 files changed, 65 insertions(+), 68 deletions(-)

diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp
index 028eb7717b..609df1184e 100644
--- a/src/AMOEBA/amoeba_convolution.cpp
+++ b/src/AMOEBA/amoeba_convolution.cpp
@@ -329,12 +329,12 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
   double time0,time1;
 
   MPI_Barrier(world);
-  time0 = MPI_Wtime();
+  time0 = platform::walltime();
 
   // perform forward FFT
 
   fft1->compute(cfft,cfft,FFT3d::FORWARD);
-  time1 = MPI_Wtime();
+  time1 = platform::walltime();
 
   if (SCALE) {
     double scale = 1.0/nfft_global;
@@ -394,12 +394,12 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
   double time0,time1;
 
   MPI_Barrier(world);
-  time0 = MPI_Wtime();
+  time0 = platform::walltime();
 
   // perform forward FFT
 
   fft1->compute(cfft,cfft,FFT3d::FORWARD);
-  time1 = MPI_Wtime();
+  time1 = platform::walltime();
 
   if (SCALE) {
     double scale = 1.0/nfft_global;
@@ -444,10 +444,10 @@ void *AmoebaConvolution::post_convolution_3d()
   double time0,time1;
 
   MPI_Barrier(world);
-  time0 = MPI_Wtime();
+  time0 = platform::walltime();
 
   fft2->compute(cfft,cfft,FFT3d::BACKWARD);
-  time1 = MPI_Wtime();
+  time1 = platform::walltime();
 
   time_fft += time1 - time0;
 
@@ -495,11 +495,11 @@ void *AmoebaConvolution::post_convolution_4d()
   double time0,time1;
 
   MPI_Barrier(world);
-  time0 = MPI_Wtime();
+  time0 = platform::walltime();
 
   fft2->compute(cfft,cfft,FFT3d::BACKWARD);
 
-  time1 = MPI_Wtime();
+  time1 = platform::walltime();
 
   time_fft += time1 - time0;
 
diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp
index 031173060c..7ff9fe7121 100644
--- a/src/AMOEBA/amoeba_induce.cpp
+++ b/src/AMOEBA/amoeba_induce.cpp
@@ -532,8 +532,6 @@
void PairAmoeba::ufield0c(double **field, double **fieldp) int i,j; double term; - double time0,time1,time2; - // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -546,18 +544,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } + double time0, time1, time2; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -781,8 +780,6 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) int i,j; double term; - double time0,time1,time2; - // zero out field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -797,11 +794,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the reciprocal space part of the permanent field + double time0, time1, time2; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); if (polar_kspace_flag) udirect1(field); - time1 = MPI_Wtime(); + time1 = platform::walltime(); for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -812,7 +810,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the real space portion of the permanent field if (polar_rspace_flag) udirect2b(field,fieldp); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // get the self-energy portion of the permanent field @@ -873,11 +871,11 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // map 2 values to grid MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_grid_uind += (time1 - time0); // pre-convolution operations including forward FFT @@ -918,11 +916,11 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // get potential MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fphi_uind += (time1 - time0); // store fractional reciprocal potentials for OPT method diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 7269128080..f302194193 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -81,17 +81,17 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); // compute the real space part of the Ewald summation if (mpole_rspace_flag) multipole_real(); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // compute the reciprocal space part of the Ewald summation if (mpole_kspace_flag) multipole_kspace(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // compute the Ewald self-energy term over all the atoms diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e0e8ecc1d9..e2b85ed22c 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -79,15 +79,15 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); if (polar_rspace_flag) polar_real(); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // compute the reciprocal space part of 
dipole interactions if (polar_kspace_flag) polar_kspace(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // compute the Ewald self-energy torque and virial terms diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 677bc48344..a1b288348a 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -47,6 +47,7 @@ enum{MUTUAL,OPT,TCG,DIRECT}; enum{GEAR,ASPC,LSQR}; #define DELTASTACK 16 +#define DEBUG_AMOEBA 0 /* ---------------------------------------------------------------------- */ @@ -371,7 +372,7 @@ void PairAmoeba::compute(int eflag, int vflag) double time0,time1,time2,time3,time4,time5,time6,time7,time8; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); // if reneighboring step: // augment neighbor list to include 1-5 neighbor flags @@ -427,8 +428,7 @@ void PairAmoeba::compute(int eflag, int vflag) comm->forward_comm(this); if (amoeba) pbc_xred(); - - time1 = MPI_Wtime(); + time1 = platform::walltime(); // ---------------------------------------- // compute components of force field @@ -437,22 +437,22 @@ void PairAmoeba::compute(int eflag, int vflag) // buffered 14-7 Vdwl, pairwise if (amoeba && hal_flag) hal(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // Pauli repulsion, pairwise if (!amoeba && repulse_flag) repulsion(); - time3 = MPI_Wtime(); + time3 = platform::walltime(); // Ewald dispersion, pairwise and long range if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion(); - time4 = MPI_Wtime(); + time4 = platform::walltime(); // multipole, pairwise and long range if (mpole_rspace_flag || mpole_kspace_flag) multipole(); - time5 = MPI_Wtime(); + time5 = platform::walltime(); // induced dipoles, interative CG relaxation // communicate induce() output values needed by ghost atoms @@ -462,17 +462,17 @@ void PairAmoeba::compute(int eflag, int vflag) cfstyle = INDUCE; comm->forward_comm(this); } - time6 = MPI_Wtime(); + time6 = platform::walltime(); // dipoles, pairwise and long range if (polar_rspace_flag || polar_kspace_flag) polar(); - time7 = MPI_Wtime(); + time7 = platform::walltime(); // charge transfer, pairwise if (!amoeba && qxfer_flag) charge_transfer(); - time8 = MPI_Wtime(); + time8 = platform::walltime(); // store energy components for output by compute pair command @@ -535,8 +535,8 @@ void PairAmoeba::finish() MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_qxfer = ave/comm->nprocs; + #if DEBUG_AMOEBA // real-space/kspace breakdown - MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mpole_rspace = ave/comm->nprocs; @@ -571,6 +571,7 @@ void PairAmoeba::finish() if (ic_kspace) time_mutual_fft = ic_kspace->time_fft; MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mutual_fft = ave/comm->nprocs; + #endif // DEBUG_AMOEBA double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -591,6 +592,7 @@ void PairAmoeba::finish() utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); + #if DEBUG_AMOEBA double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace; double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace; @@ -607,7 +609,7 @@ void PairAmoeba::finish() utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," - 
Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); - + #endif } } diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index fd4aece6c8..908c9e409c 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -102,7 +102,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() double time0,time1; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); // perform forward FFT @@ -112,7 +112,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() fft1->compute(cfft,cfft,FFT3d::FORWARD); #endif - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fft += time1 - time0; @@ -146,11 +146,11 @@ void *AmoebaConvolutionGPU::post_convolution_4d() double time0,time1; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); fft2->compute(cfft,cfft,FFT3d::BACKWARD); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fft += time1 - time0; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 5bc2b3a48c..34605725a5 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -919,11 +919,8 @@ void PairAmoebaGPU::udirect2b_cpu() void PairAmoebaGPU::ufield0c(double **field, double **fieldp) { - //int i,j; double term; - double time0,time1,time2; - // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -934,16 +931,18 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // get the real space portion of the mutual field first + double time0, time1, time2; + MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); if (polar_rspace_flag) umutual2b(field,fieldp); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -1049,20 +1048,19 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } - double time0, time1; - // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid + double time0, time1; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_grid_uind += (time1 - time0); // pre-convolution operations including forward FFT @@ -1102,11 +1100,12 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // get potential - time0 = MPI_Wtime(); + MPI_Barrier(world); + time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fphi_uind += (time1 - time0); // store fractional reciprocal potentials for OPT method diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 1f0f3e820a..3049799433 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -1016,8 +1016,6 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) { double term; - double time0,time1,time2; - // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -1028,16 +1026,17 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) // get the real space portion of the mutual 
field first + double time0, time1, time2; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); if (polar_rspace_flag) umutual2b(field,fieldp); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -1123,21 +1122,19 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } - double time0, time1; - // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid - + double time0, time1; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_grid_uind += (time1 - time0); // pre-convolution operations including forward FFT @@ -1177,11 +1174,12 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) // get potential - time0 = MPI_Wtime(); + MPI_Barrier(world); + time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fphi_uind += (time1 - time0); // store fractional reciprocal potentials for OPT method From ff709f5897c38a8d5d685a9957865dc5ae0dc27a Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 16:29:16 -0500 Subject: [PATCH 158/181] 'include' for std::shuffle --- src/REAXFF/fix_reaxff_species.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index 6ad7b50226..caffcc08af 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -36,6 +36,7 @@ #include "pair_reaxff.h" #include "reaxff_defs.h" +#include #include #include #include From 846f00ce32d6e59cd7acd93b85a8d1eff4d2eea4 Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 16:58:19 -0500 Subject: [PATCH 159/181] add citation --- src/REAXFF/fix_reaxff_species.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index caffcc08af..9bc70e7617 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -21,6 +21,7 @@ #include "atom.h" #include "atom_vec.h" +#include "citeme.h" #include "comm.h" #include "domain.h" #include "error.h" @@ -44,6 +45,17 @@ using namespace LAMMPS_NS; using namespace FixConst; +static const char cite_reaxff_species_delete[] = + "fix reaxff/species, 'delete' keyword: https://doi.org/10.1016/j.carbon.2022.11.002\n\n" + "@Article{Gissinger23,\n" + " author = {J. R. Gissinger, S. R. Zavada, J. G. Smith, J. Kemppainen, I. Gallegos, G. M. Odegard, E. J. Siochi, K. E. 
Wise},\n" + " title = {Predicting char yield of high-temperature resins},\n" + " journal = {Carbon},\n" + " year = 2023,\n" + " volume = 202,\n" + " pages = {336-347}\n" + "}\n\n"; + /* ---------------------------------------------------------------------- */ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : @@ -52,6 +64,8 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : x0(nullptr), BOCut(nullptr), fp(nullptr), pos(nullptr), fdel(nullptr), ele(nullptr), eletype(nullptr), filepos(nullptr), filedel(nullptr) { + if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); + if (narg < 7) utils::missing_cmd_args(FLERR, "fix reaxff/species", error); force_reneighbor = 1; From 375fad6d2a132ddac5827b6f3b4b6b387e659970 Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 17:13:56 -0500 Subject: [PATCH 160/181] parallel version --- src/REAXFF/fix_reaxff_species.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index 9bc70e7617..c3335a0397 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -906,11 +906,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) std::minstd_rand park_rng(rnd()); int *molrange; memory->create(molrange,Nmole,"reaxff/species:molrange"); - for (m = 0; m < Nmole; m++) - molrange[m] = m + 1; - // shuffle index when using rate_limit, in case order is biased - if (delete_Nlimit > 0) - std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + if (comm->me == 0) { + for (m = 0; m < Nmole; m++) + molrange[m] = m + 1; + // shuffle index when using rate_limit, in case order is biased + if (delete_Nlimit > 0) + std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + } + MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); int this_delete_Tcount = 0; for (int mm = 0; mm < Nmole; mm++) { From f6ded5a7d742e600cab2c88878be137d65c68b94 Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 17:36:46 -0500 Subject: [PATCH 161/181] reduce unnecessary communication --- src/REAXFF/fix_reaxff_species.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index c3335a0397..a9bab28003 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -906,14 +906,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) std::minstd_rand park_rng(rnd()); int *molrange; memory->create(molrange,Nmole,"reaxff/species:molrange"); - if (comm->me == 0) { - for (m = 0; m < Nmole; m++) - molrange[m] = m + 1; + for (m = 0; m < Nmole; m++) + molrange[m] = m + 1; + if (delete_Nlimit > 0) { // shuffle index when using rate_limit, in case order is biased - if (delete_Nlimit > 0) + if (comm->me == 0) std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); } - MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); int this_delete_Tcount = 0; for (int mm = 0; mm < Nmole; mm++) { From 8537ccb840e406b2049291c2ddfccbb05ce90063 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 21 Jan 2023 11:18:51 -0500 Subject: [PATCH 162/181] add CMake option to skip automatic download of large potential files --- cmake/CMakeLists.txt | 2 ++ cmake/Modules/LAMMPSUtils.cmake | 40 +++++++++++++++++---------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 
0223750ace..767bbbfe34 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -566,6 +566,8 @@ RegisterStyles(${LAMMPS_SOURCE_DIR}) ######################################################## # Fetch missing external files and archives for packages ######################################################## +option(DOWNLOAD_POTENTIALS "Automatically download large potential files" ON) +mark_as_advanced(DOWNLOAD_POTENTIALS) foreach(PKG ${STANDARD_PACKAGES} ${EXTRA_PACKAGES} ${SUFFIX_PACKAGES}) if(PKG_${PKG}) FetchPotentials(${LAMMPS_SOURCE_DIR}/${PKG} ${LAMMPS_POTENTIALS_DIR}) diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake index 9602379403..d42f91f10e 100644 --- a/cmake/Modules/LAMMPSUtils.cmake +++ b/cmake/Modules/LAMMPSUtils.cmake @@ -118,25 +118,27 @@ endfunction(GenerateBinaryHeader) # fetch missing potential files function(FetchPotentials pkgfolder potfolder) - if(EXISTS "${pkgfolder}/potentials.txt") - file(STRINGS "${pkgfolder}/potentials.txt" linelist REGEX "^[^#].") - foreach(line ${linelist}) - string(FIND ${line} " " blank) - math(EXPR plusone "${blank}+1") - string(SUBSTRING ${line} 0 ${blank} pot) - string(SUBSTRING ${line} ${plusone} -1 sum) - if(EXISTS "${LAMMPS_POTENTIALS_DIR}/${pot}") - file(MD5 "${LAMMPS_POTENTIALS_DIR}/${pot}" oldsum) - endif() - if(NOT sum STREQUAL oldsum) - message(STATUS "Downloading external potential ${pot} from ${LAMMPS_POTENTIALS_URL}") - string(MD5 TMP_EXT "${CMAKE_BINARY_DIR}") - file(DOWNLOAD "${LAMMPS_POTENTIALS_URL}/${pot}.${sum}" "${CMAKE_BINARY_DIR}/${pot}.${TMP_EXT}" - EXPECTED_HASH MD5=${sum} SHOW_PROGRESS) - file(COPY "${CMAKE_BINARY_DIR}/${pot}.${TMP_EXT}" DESTINATION "${LAMMPS_POTENTIALS_DIR}") - file(RENAME "${LAMMPS_POTENTIALS_DIR}/${pot}.${TMP_EXT}" "${LAMMPS_POTENTIALS_DIR}/${pot}") - endif() - endforeach() + if(DOWNLOAD_POTENTIALS) + if(EXISTS "${pkgfolder}/potentials.txt") + file(STRINGS "${pkgfolder}/potentials.txt" linelist REGEX "^[^#].") + foreach(line ${linelist}) + string(FIND ${line} " " blank) + math(EXPR plusone "${blank}+1") + string(SUBSTRING ${line} 0 ${blank} pot) + string(SUBSTRING ${line} ${plusone} -1 sum) + if(EXISTS "${LAMMPS_POTENTIALS_DIR}/${pot}") + file(MD5 "${LAMMPS_POTENTIALS_DIR}/${pot}" oldsum) + endif() + if(NOT sum STREQUAL oldsum) + message(STATUS "Downloading external potential ${pot} from ${LAMMPS_POTENTIALS_URL}") + string(RANDOM LENGTH 10 TMP_EXT) + file(DOWNLOAD "${LAMMPS_POTENTIALS_URL}/${pot}.${sum}" "${CMAKE_BINARY_DIR}/${pot}.${TMP_EXT}" + EXPECTED_HASH MD5=${sum} SHOW_PROGRESS) + file(COPY "${CMAKE_BINARY_DIR}/${pot}.${TMP_EXT}" DESTINATION "${LAMMPS_POTENTIALS_DIR}") + file(RENAME "${LAMMPS_POTENTIALS_DIR}/${pot}.${TMP_EXT}" "${LAMMPS_POTENTIALS_DIR}/${pot}") + endif() + endforeach() + endif() endif() endfunction(FetchPotentials) From 658328dd9d57bc84d308affe0327217a42f9e947 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 22 Jan 2023 17:24:15 -0600 Subject: [PATCH 163/181] Added a note in the amoeba doc page on the not-yet resolved issue with integrated GPUs, removed commented out and debugging stuffs in the AM/HP kernels --- doc/src/pair_amoeba.rst | 5 ++ lib/gpu/lal_amoeba.cu | 87 +--------------------------- lib/gpu/lal_hippo.cu | 123 +++++----------------------------------- 3 files changed, 20 insertions(+), 195 deletions(-) diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index 113ae560f7..ab82fa5593 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -200,6 +200,11 @@ These pair styles can only be used via the 
*pair* keyword of the .. include:: accel_styles.rst +.. note:: + + There is a unresolved issue with the `amoeba/gpu` and `hippo/gpu` + pair styles with the OpenCL build when running on integrated GPUs. + ---------- Restrictions diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index b3bbabadc3..68d15cfb47 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -//#include + #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -448,20 +448,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); if (iioff2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; @@ -825,12 +803,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); - numtyp bn[4],bcn[3]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ + numtyp bn[4], bcn[3]; bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; @@ -849,7 +822,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, if (damp != (numtyp)0.0) { numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype] if (pgamma != (numtyp)0.0) { - //damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); numtyp tmp = r*ucl_recip(damp); damp = pgamma * ucl_sqrt(tmp*tmp*tmp); if (damp < (numtyp)50.0) { @@ -860,7 +832,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } } else { pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - //damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); numtyp tmp = r*ucl_recip(damp); damp = pgamma * (tmp*tmp*tmp); if (damp < (numtyp)50.0) { @@ -930,7 +901,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - //local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; @@ -939,8 +909,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); - //numtyp4 xi__; - if (iioff2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; @@ -1249,7 +1200,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; numtyp uir = uix*xr + uiy*yr + uiz*zr; - //numtyp uirp = uixp*xr + uiyp*yr + uizp*zr; numtyp ukr = ukx*xr + uky*yr + ukz*zr; numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr; @@ -1280,15 +1230,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp drc3[3],drc5[3],drc7[3]; numtyp urc3[3],urc5[3]; - numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[5]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp alsq2 = (numtyp)2.0 * aewald*aewald; @@ -1318,7 +1262,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] if (damp != (numtyp)0.0) { numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - //damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); numtyp tmp = r*ucl_recip(damp); damp = pgamma * (tmp*tmp*tmp); 
if (damp < (numtyp)50.0) { @@ -1614,9 +1557,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // accumulate ufld and dufld to compute tep store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep); - // accumate force, energy and virial - //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, - // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } @@ -1746,17 +1686,6 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, int i = (igridx - nxlo_out) - nlpts; for (int ib = 0; ib < bsorder; ib++) { - /* - tq_1 = grid[k][j][i][0]; - tq_2 = grid[k][j][i][1]; - t0_1 += tq_1*thetai1[m][ib][0]; - t1_1 += tq_1*thetai1[m][ib][1]; - t2_1 += tq_1*thetai1[m][ib][2]; - t0_2 += tq_2*thetai1[m][ib][0]; - t1_2 += tq_2*thetai1[m][ib][1]; - t2_2 += tq_2*thetai1[m][ib][2]; - t3 += (tq_1+tq_2)*thetai1[m][ib][3]; - */ const int i1 = istart + ib; const numtyp4 tha1 = thetai1[i1]; const int gidx = my + i; // k*ngridxy + j*ngridx + i; @@ -1963,12 +1892,6 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, int k = (igridz - nzlo_out) - nlpts; for (int kb = 0; kb < bsorder; kb++) { - /* - v0 = thetai3[m][kb][0]; - v1 = thetai3[m][kb][1]; - v2 = thetai3[m][kb][2]; - v3 = thetai3[m][kb][3]; - */ int i3 = istart + kb; numtyp4 tha3 = thetai3[i3]; numtyp v0 = tha3.x; @@ -1988,12 +1911,6 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, int j = (igridy - nylo_out) - nlpts; for (int jb = 0; jb < bsorder; jb++) { - /* - u0 = thetai2[m][jb][0]; - u1 = thetai2[m][jb][1]; - u2 = thetai2[m][jb][2]; - u3 = thetai2[m][jb][3]; - */ int i2 = istart + jb; numtyp4 tha2 = thetai2[i2]; numtyp u0 = tha2.x; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index a5fca5cc80..0647a736a8 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -#include + #include "lal_hippo_extra.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -455,8 +455,6 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -467,7 +465,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, } const numtyp4 pol1i = polar1[i]; - //numtyp ci = pol1i.x; // rpole[i][0]; + //numtyp ci = pol1i.x; // rpole[i][0]; numtyp dix = pol1i.y; // rpole[i][1]; numtyp diy = pol1i.z; // rpole[i][2]; numtyp diz = pol1i.w; // rpole[i][3]; @@ -490,7 +488,6 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, int j = jextra & NEIGHMASK15; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - //int jtype=jx.w; // Compute r12 numtyp xr = jx.x - ix.x; @@ -502,18 +499,18 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, const numtyp4 pol1j = polar1[j]; //numtyp ck = pol1j.x; // rpole[j][0]; - numtyp dkx = pol1j.y; // rpole[j][1]; - numtyp dky = pol1j.z; // rpole[j][2]; - numtyp dkz = pol1j.w; // rpole[j][3]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; const numtyp4 pol2j = polar2[j]; - numtyp qkxx = pol2j.x; // rpole[j][4]; - numtyp qkxy = pol2j.y; // rpole[j][5]; - numtyp qkxz = pol2j.z; 
// rpole[j][6]; - numtyp qkyy = pol2j.w; // rpole[j][8]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; const numtyp4 pol3j = polar3[j]; - numtyp qkyz = pol3j.x; // rpole[j][9]; - numtyp qkzz = pol3j.y; // rpole[j][12]; - int jtype = pol3j.z; // amtype[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; numtyp sizk = coeff_rep[jtype].x; // sizpr[jtype]; numtyp dmpk = coeff_rep[jtype].y; // dmppr[jtype]; @@ -776,7 +773,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, int j = jextra & NEIGHMASK15; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - //int jtype=jx.w; // Compute r12 numtyp xr = ix.x - jx.x; @@ -784,8 +780,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp zr = ix.z - jx.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - //if (r2>off2) continue; - int jtype = polar3[j].z; // amtype[j]; int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; @@ -886,9 +880,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } // iioff2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -1090,11 +1078,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[6]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp alsq2 = (numtyp)2.0 * aewald*aewald; @@ -1213,9 +1196,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, // accumulate tq store_answers_hippo_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep); - // accumate force, energy and virial: use _acc if not the first kernel - //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, - // offset,eflag,vflag,ans,engv); + // accumate force, energy and virial store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } @@ -1294,8 +1275,6 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - //if (r2>off2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1345,11 +1324,6 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[4]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; @@ -1439,9 +1413,6 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); - //numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); - - //numtyp4 xi__; if (iioff2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; @@ -1494,7 +1461,6 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, const numtyp4 pol3j = polar3[j]; int jtype = pol3j.z; // amtype[j]; - //int jgroup = pol3j.w; // amgroup[j]; const numtyp4 pol4j = polar4[j]; numtyp ukx = 
pol4j.x; // uind[j][0]; numtyp uky = pol4j.y; // uind[j][1]; @@ -1516,11 +1482,6 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[4]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; @@ -1546,9 +1507,6 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, tdipdip[3] = -rr3ik + rr5ik*yr*yr; tdipdip[4] = rr5ik*yr*zr; tdipdip[5] = -rr3ik + rr5ik*zr*zr; - //if (i==0 && j == 10) - // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", - // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); numtyp fid[3]; fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; @@ -1638,8 +1596,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -1672,9 +1628,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, uiyp = pol5i.y; // uinp[i][1]; uizp = pol5i.z; // uinp[i][2]; - // debug: - // xi__ = ix; xi__.w = itype; - numtyp corei = coeff_amclass[itype].z; // pcore[iclass]; numtyp alphai = coeff_amclass[itype].w; // palpha[iclass]; numtyp vali = polar6[i].x; @@ -1692,8 +1645,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - //if (r2>off2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; @@ -1759,11 +1710,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[5]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp alsq2 = (numtyp)2.0 * aewald*aewald; @@ -1824,7 +1770,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; dufld[5] += zr*tiz5 + zr*zr*tuir; - // get the field gradient for direct polarization force numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; @@ -1962,7 +1907,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcz = (numtyp)-2.0 * depz; numtyp term1,term2,term3; - //numtyp term4,term5,term6,term7; // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo @@ -2039,8 +1983,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep); // accumate force, energy and virial - //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, - // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } @@ -2063,10 +2005,6 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, const int nxlo_out, const int ngridxy, const int ngridx) { - //int tid, ii, offset, i, n_stride; - //atom_info(t_per_atom,ii,tid,offset); - - int tid=THREAD_ID_X; int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; @@ -2125,12 +2063,6 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, int k = (igridz - nzlo_out) - nlpts; for (int kb = 
0; kb < bsorder; kb++) { - /* - v0 = thetai3[m][kb][0]; - v1 = thetai3[m][kb][1]; - v2 = thetai3[m][kb][2]; - v3 = thetai3[m][kb][3]; - */ int i3 = istart + kb; numtyp4 tha3 = thetai3[i3]; numtyp v0 = tha3.x; @@ -2162,12 +2094,6 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, int j = (igridy - nylo_out) - nlpts; for (int jb = 0; jb < bsorder; jb++) { - /* - u0 = thetai2[m][jb][0]; - u1 = thetai2[m][jb][1]; - u2 = thetai2[m][jb][2]; - u3 = thetai2[m][jb][3]; - */ int i2 = istart + jb; numtyp4 tha2 = thetai2[i2]; numtyp u0 = tha2.x; @@ -2184,17 +2110,6 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, int i = (igridx - nxlo_out) - nlpts; for (int ib = 0; ib < bsorder; ib++) { - /* - tq_1 = grid[k][j][i][0]; - tq_2 = grid[k][j][i][1]; - t0_1 += tq_1*thetai1[m][ib][0]; - t1_1 += tq_1*thetai1[m][ib][1]; - t2_1 += tq_1*thetai1[m][ib][2]; - t0_2 += tq_2*thetai1[m][ib][0]; - t1_2 += tq_2*thetai1[m][ib][1]; - t2_2 += tq_2*thetai1[m][ib][2]; - t3 += (tq_1+tq_2)*thetai1[m][ib][3]; - */ int i1 = istart + ib; numtyp4 tha1 = thetai1[i1]; numtyp w0 = tha1.x; @@ -2403,12 +2318,6 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1, int k = (igridz - nzlo_out) - nlpts; for (int kb = 0; kb < bsorder; kb++) { - /* - v0 = thetai3[m][kb][0]; - v1 = thetai3[m][kb][1]; - v2 = thetai3[m][kb][2]; - v3 = thetai3[m][kb][3]; - */ int i3 = istart + kb; numtyp4 tha3 = thetai3[i3]; numtyp v0 = tha3.x; @@ -2428,12 +2337,6 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1, int j = (igridy - nylo_out) - nlpts; for (int jb = 0; jb < bsorder; jb++) { - /* - u0 = thetai2[m][jb][0]; - u1 = thetai2[m][jb][1]; - u2 = thetai2[m][jb][2]; - u3 = thetai2[m][jb][3]; - */ int i2 = istart + jb; numtyp4 tha2 = thetai2[i2]; numtyp u0 = tha2.x; From 8e79e2efa5a971372574ed28f9d441f5a9293aed Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 23 Jan 2023 00:18:42 -0600 Subject: [PATCH 164/181] More cleanup, fixed bugs with hippo fphi kernels for mixed precision --- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_hippo.cu | 79 +++++++++++++++++++++---------------------- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 68d15cfb47..6317ba8d94 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1557,6 +1557,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // accumulate ufld and dufld to compute tep store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep); + // accumate force, energy and virial store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } @@ -1834,7 +1835,6 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, } } - /* ---------------------------------------------------------------------- fphi_mpole = multipole potential from grid fphi_mpole extracts the permanent multipole potential from diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 0647a736a8..1611e8aece 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1996,10 +1996,10 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, - const __global numtyp *restrict grid, - __global numtyp *restrict fdip_phi1, - __global numtyp *restrict fdip_phi2, - __global numtyp *restrict fdip_sum_phi, + const __global numtyp2 *restrict grid, + 
__global acctyp *restrict fdip_phi1, + __global acctyp *restrict fdip_phi2, + __global acctyp *restrict fdip_sum_phi, const int bsorder, const int inum, const int nzlo_out, const int nylo_out, const int nxlo_out, const int ngridxy, @@ -2010,12 +2010,12 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, if (ii Date: Mon, 23 Jan 2023 17:30:35 -0500 Subject: [PATCH 165/181] Update fix_reaxff_species.cpp --- src/REAXFF/fix_reaxff_species.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index a9bab28003..ce04be2cc8 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -64,8 +64,6 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : x0(nullptr), BOCut(nullptr), fp(nullptr), pos(nullptr), fdel(nullptr), ele(nullptr), eletype(nullptr), filepos(nullptr), filedel(nullptr) { - if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); - if (narg < 7) utils::missing_cmd_args(FLERR, "fix reaxff/species", error); force_reneighbor = 1; @@ -283,6 +281,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : error->all(FLERR, "Incompatible combination fix reaxff/species command options"); if (delete_Nlimit > 0) { + if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount"); for (int i = 0; i < delete_Nsteps; i++) From 5014e0434170d0bfbff9532acdc766d0bf8979eb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 24 Jan 2023 08:40:08 -0600 Subject: [PATCH 166/181] Removed commented out code, ensured that ic_kspace is not nullptr when call precompute_kspace for hippo/gpu --- lib/gpu/lal_base_amoeba.cpp | 11 +++-------- src/GPU/pair_hippo_gpu.cpp | 12 ++++++------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 21d9975b28..88dd10eab1 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -573,7 +573,8 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, int numel = _num_grid_points; if (_cgrid_brick.cols() == 0) { - _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); + int nsize=(int)(((double)numel)*1.1); + _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); } else if (numel > (int)_cgrid_brick.cols()) { _cgrid_brick.resize(numel); } @@ -689,13 +690,7 @@ int BaseAmoebaT::fphi_mpole() { const int BX=block_size(); const int GX=static_cast(ceil(static_cast(ainum)/BX)); - /* - const int cus = device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/BX)); - } - */ + time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_mpole.set_size(GX,BX); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 3049799433..8611c1b56a 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -475,12 +475,12 @@ void PairHippoGPU::induce() // allocate memory and make early host-device transfers // must be done before the first ufield0c - - hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, - thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); + if (ic_kspace) + hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, 
ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating From aaa918cbe74eabab131dbb7971d43265b96bf6ee Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 24 Jan 2023 17:05:48 -0600 Subject: [PATCH 167/181] Fixed bugs with access mode on the host side of thetai[1-3] --- lib/gpu/lal_base_amoeba.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 88dd10eab1..e80fa01c2b 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -484,9 +484,9 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); - _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); - _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); + _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); From 40c8fcb03aab95e85908285aafe3bbabfdfa74e2 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 21:05:36 -0500 Subject: [PATCH 168/181] disallow using single precision FFTs with AMOEBA package --- cmake/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index a6956f5f5d..8184f9784d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -527,6 +527,11 @@ foreach(PKG_WITH_INCL KSPACE PYTHON ML-IAP VORONOI COLVARS ML-HDNNP MDI MOLFILE endif() endforeach() +# AMOEBA is not compatible with single precision FFTs +if(PKG_AMOEBA AND FFT_SINGLE) + message(FATAL_ERROR "Package AMOEBA is not compatible with single precision FFTs") +endif() + # optionally enable building script wrappers using swig option(WITH_SWIG "Build scripting language wrappers with SWIG" OFF) if(WITH_SWIG) From dec3afe5956c37d2e514bdd1c6345bbecc7b299e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 21:15:37 -0500 Subject: [PATCH 169/181] make synchronization for timers optional. 
only enable with "timer sync" --- src/AMOEBA/amoeba_convolution.cpp | 9 +++++---- src/AMOEBA/amoeba_induce.cpp | 9 +++++---- src/AMOEBA/amoeba_multipole.cpp | 3 ++- src/AMOEBA/amoeba_polar.cpp | 3 ++- src/AMOEBA/pair_amoeba.cpp | 3 ++- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index 609df1184e..e58bb33b41 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -22,6 +22,7 @@ #include "memory.h" #include "neighbor.h" #include "remap_wrap.h" +#include "timer.h" #include #include @@ -328,7 +329,7 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() double time0,time1; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // perform forward FFT @@ -393,7 +394,7 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() double time0,time1; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // perform forward FFT @@ -443,7 +444,7 @@ void *AmoebaConvolution::post_convolution_3d() double time0,time1; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); fft2->compute(cfft,cfft,FFT3d::BACKWARD); @@ -494,7 +495,7 @@ void *AmoebaConvolution::post_convolution_4d() double time0,time1; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); fft2->compute(cfft,cfft,FFT3d::BACKWARD); diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 7ff9fe7121..6ac8148c59 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -24,6 +24,7 @@ #include "math_special.h" #include "my_page.h" #include "neigh_list.h" +#include "timer.h" #include @@ -545,7 +546,7 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } double time0, time1, time2; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // get the real space portion of the mutual field @@ -795,7 +796,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the reciprocal space part of the permanent field double time0, time1, time2; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); if (polar_kspace_flag) udirect1(field); @@ -870,7 +871,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // map 2 values to grid - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); @@ -915,7 +916,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // get potential - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index f302194193..848e1a13cb 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -21,6 +21,7 @@ #include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include @@ -80,7 +81,7 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // compute the real space part of the Ewald summation diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e2b85ed22c..e817e706dc 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -21,6 +21,7 @@ 
#include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include #include @@ -78,7 +79,7 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); if (polar_rspace_flag) polar_real(); diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index a1b288348a..0812fe43f0 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -29,6 +29,7 @@ #include "my_page.h" #include "neigh_list.h" #include "neighbor.h" +#include "timer.h" #include "update.h" #include @@ -371,7 +372,7 @@ void PairAmoeba::compute(int eflag, int vflag) double time0,time1,time2,time3,time4,time5,time6,time7,time8; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // if reneighboring step: From b17689af6be379c67f9332ab88790528307c4f3b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 21:28:08 -0500 Subject: [PATCH 170/181] doc fixes --- doc/src/fix_rigid.rst | 4 ++-- doc/src/pair_amoeba.rst | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/src/fix_rigid.rst b/doc/src/fix_rigid.rst index 9a958e50d1..3a2477f90a 100644 --- a/doc/src/fix_rigid.rst +++ b/doc/src/fix_rigid.rst @@ -732,8 +732,8 @@ choices: * Use one of the 4 NPT or NPH styles for the rigid bodies. Use the *dilate* all option so that it will dilate the positions of the - *non-rigid particles as well. Use :doc:`fix nvt ` (or any - *other thermostat) for the non-rigid particles. + non-rigid particles as well. Use :doc:`fix nvt ` (or any + other thermostat) for the non-rigid particles. * Use :doc:`fix npt ` for the group of non-rigid particles. Use the *dilate* all option so that it will dilate the center-of-mass positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index ab82fa5593..79b3daf22f 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -202,8 +202,9 @@ These pair styles can only be used via the *pair* keyword of the .. note:: - There is a unresolved issue with the `amoeba/gpu` and `hippo/gpu` - pair styles with the OpenCL build when running on integrated GPUs. + Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu' + when compiling the GPU package for OpenCL has a few known issues + when running on integrated GPUs and the calculation may crash. 
---------- From 878681999321df0e1cb2a0cfaff4678b0fd81e0f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 22:31:49 -0500 Subject: [PATCH 171/181] use FFT_SCALAR more consistently to perhaps support single precision FFT some time also, use "override" instead of virtual and add a forgotten virtual --- src/AMOEBA/amoeba_convolution.cpp | 4 ++-- src/AMOEBA/amoeba_convolution.h | 10 ++++----- src/AMOEBA/amoeba_dispersion.cpp | 4 ++-- src/AMOEBA/amoeba_induce.cpp | 12 +++++------ src/AMOEBA/amoeba_kspace.cpp | 10 ++++----- src/AMOEBA/amoeba_multipole.cpp | 10 +++------ src/AMOEBA/amoeba_polar.cpp | 28 ++++++++++++------------ src/AMOEBA/pair_amoeba.h | 12 +++++------ src/GPU/pair_amoeba_gpu.cpp | 36 +++++++++++++++---------------- src/GPU/pair_amoeba_gpu.h | 25 +++++++++++---------- src/GPU/pair_hippo_gpu.cpp | 10 ++++----- src/GPU/pair_hippo_gpu.h | 26 +++++++++++----------- 12 files changed, 91 insertions(+), 96 deletions(-) diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index e58bb33b41..ae3dbf16c4 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -338,7 +338,7 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } @@ -403,7 +403,7 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 44dc5b1687..bed65149ec 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers { int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out; int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft; bigint nfft_global; // nx * ny * nz - double *grid_brick_start; // lower left corner of (c)grid_brick data + FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int); ~AmoebaConvolution(); @@ -61,14 +61,14 @@ class AmoebaConvolution : protected Pointers { class Grid3d *gc; class Remap *remap; - double ***grid_brick; // 3d real brick grid with ghosts - double ****cgrid_brick; // 4d complex brick grid with ghosts + FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts + FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector - double *gc_buf1, *gc_buf2; // buffers for GridComm - double *remap_buf; // buffer for Remap + FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm + FFT_SCALAR *remap_buf; // buffer for Remap void allocate_grid(); void deallocate_grid(); diff --git a/src/AMOEBA/amoeba_dispersion.cpp b/src/AMOEBA/amoeba_dispersion.cpp index f3af921d85..cc283f22d2 100644 --- a/src/AMOEBA/amoeba_dispersion.cpp +++ b/src/AMOEBA/amoeba_dispersion.cpp @@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) d_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero(); // map atoms to grid @@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace() // 
pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = d_kspace->pre_convolution(); + FFT_SCALAR *gridfft = d_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 6ac8148c59..ecc20a198c 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -867,7 +867,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid @@ -882,7 +882,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -912,7 +912,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential @@ -1090,7 +1090,7 @@ void PairAmoeba::udirect1(double **field) // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by setup() - double ***gridpre = (double ***) i_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero(); // map multipole moments to grid @@ -1099,7 +1099,7 @@ void PairAmoeba::udirect1(double **field) // pre-convolution operations including forward FFT // gridfft = my 1d portion of complex 3d grid in FFT decomp - double *gridfft = i_kspace->pre_convolution(); + FFT_SCALAR *gridfft = i_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1144,7 +1144,7 @@ void PairAmoeba::udirect1(double **field) // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) i_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index c47e734c5e..6d2fb64dd6 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -523,7 +523,7 @@ void PairAmoeba::frac_to_cart() grid_mpole maps fractional atomic multipoles to PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_mpole(double **fmp, double ***grid) +void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -596,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid) the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_mpole(double ***grid, double **fphi) +void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi) { int i,j,k,m,ib,jb,kb; double v0,v1,v2,v3; @@ -740,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi) grid_uind maps fractional induced dipoles to the PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_uind(double 
**fuind, double **fuinp, double ****grid) +void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -791,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, +void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, double **fdip_phi2, double **fdip_sum_phi) { int i,j,k,m,ib,jb,kb; @@ -1040,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, grid_disp maps dispersion coefficients to PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_disp(double ***grid) +void PairAmoeba::grid_disp(FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb,itype,iclass; double v0,u0,t0; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 848e1a13cb..a1503a91f3 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -452,10 +452,6 @@ void PairAmoeba::multipole_real() rr9 = bn[4] - scalek*rr9; rr11 = bn[5] - scalek*rr11; e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; - if (i == 0 && j < 10) { - //printf("j = %d: scalek = %f; rr11 = %f; terms: %f %f %f %f %f\n", j, scalek, rr11, term1, term2, term3, term4, term5); - //printf("j = %d: felec = %f; rr1 = %f; bn0 = %f\n", j, felec, rr1, bn[0]); - } // find standard multipole intermediates for force and torque @@ -662,7 +658,7 @@ void PairAmoeba::multipole_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) m_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero(); // map atoms to grid @@ -671,7 +667,7 @@ void PairAmoeba::multipole_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = m_kspace->pre_convolution(); + FFT_SCALAR *gridfft = m_kspace->pre_convolution(); // --------------------- // convolution operation @@ -742,7 +738,7 @@ void PairAmoeba::multipole_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) m_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e817e706dc..3c51426beb 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -1340,7 +1340,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1349,7 +1349,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1399,7 +1399,7 @@ void PairAmoeba::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) 
p_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); // get potential @@ -1432,7 +1432,7 @@ void PairAmoeba::polar_kspace() // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre2 = (double ****) pc_kspace->zero(); + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); // map 2 values to grid @@ -1441,7 +1441,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = pc_kspace->pre_convolution(); + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1464,7 +1464,7 @@ void PairAmoeba::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) pc_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); // get potential @@ -1870,7 +1870,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1900,7 +1900,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1909,7 +1909,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1966,7 +1966,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1975,12 +1975,12 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; - memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign ??? 
to the PME grid @@ -1995,7 +1995,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -2004,7 +2004,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index f14be4bd11..cdeee6c95f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -381,7 +381,7 @@ class PairAmoeba : public Pair { virtual void induce(); void ulspred(); - void ufield0c(double **, double **); + virtual void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); void dfield0c(double **, double **); virtual void umutual1(double **, double **); @@ -407,11 +407,11 @@ class PairAmoeba : public Pair { void fphi_to_cphi(double **, double **); void frac_to_cart(); - void grid_mpole(double **, double ***); - void fphi_mpole(double ***, double **); - void grid_uind(double **, double **, double ****); - virtual void fphi_uind(double ****, double **, double **, double **); - void grid_disp(double ***); + void grid_mpole(double **, FFT_SCALAR ***); + void fphi_mpole(FFT_SCALAR ***, double **); + void grid_uind(double **, double **, FFT_SCALAR ****); + virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **); + void grid_disp(FFT_SCALAR ***); void kewald(); void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 34605725a5..4213946f38 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1050,7 +1050,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid @@ -1066,7 +1066,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1096,7 +1096,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential @@ -1150,7 +1150,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, +void PairAmoebaGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, double **fdip_phi2, double **fdip_sum_phi) { if (!gpu_fphi_uind_ready) { @@ -1422,7 +1422,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ 
ghost values - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1431,7 +1431,7 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1481,7 +1481,7 @@ void PairAmoebaGPU::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) p_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); // get potential @@ -1539,7 +1539,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre2 = (double ****) pc_kspace->zero(); + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); // map 2 values to grid @@ -1548,7 +1548,7 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = pc_kspace->pre_convolution(); + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1571,7 +1571,7 @@ void PairAmoebaGPU::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) pc_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); // get potential @@ -1819,7 +1819,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1849,7 +1849,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1858,7 +1858,7 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1915,7 +1915,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1924,12 +1924,12 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; - memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign ??? 
to the PME grid @@ -1944,7 +1944,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1953,7 +1953,7 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index b7230594c5..c9b9b73a58 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -27,23 +27,22 @@ namespace LAMMPS_NS { class PairAmoebaGPU : public PairAmoeba { public: PairAmoebaGPU(LAMMPS *lmp); - ~PairAmoebaGPU(); - void init_style(); - double memory_usage(); + ~PairAmoebaGPU() override; + void init_style() override; + double memory_usage() override; enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; - virtual void induce(); + void induce() override; - //virtual void dispersion_real(); - virtual void multipole_real(); - virtual void udirect2b(double **, double **); - virtual void umutual1(double **, double **); - virtual void fphi_uind(double ****, double **, double **, double **); - virtual void umutual2b(double **, double **); - virtual void ufield0c(double **, double **); - virtual void polar_real(); - virtual void polar_kspace(); + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; + void polar_kspace() override; private: int gpu_mode; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 8611c1b56a..83c72d5252 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -1124,7 +1124,7 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid @@ -1140,7 +1140,7 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1170,7 +1170,7 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential @@ -1231,8 +1231,8 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, - double **fdip_phi2, double **fdip_sum_phi) +void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, + double **fdip_phi2, double 
**fdip_sum_phi) { if (!gpu_fphi_uind_ready) { PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 44bebd29f3..7955c97470 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -27,23 +27,23 @@ namespace LAMMPS_NS { class PairHippoGPU : public PairAmoeba { public: PairHippoGPU(LAMMPS *lmp); - ~PairHippoGPU(); - void init_style(); - double memory_usage(); + ~PairHippoGPU() override; + void init_style() override; + double memory_usage() override; enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; - virtual void induce(); + void induce() override; - virtual void repulsion(); - virtual void dispersion_real(); - virtual void multipole_real(); - virtual void udirect2b(double **, double **); - virtual void umutual1(double **, double **); - virtual void fphi_uind(double ****, double **, double **, double **); - virtual void umutual2b(double **, double **); - virtual void ufield0c(double **, double **); - virtual void polar_real(); + void repulsion() override; + void dispersion_real() override; + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; private: int gpu_mode; From 6c63d7dcb92553dd9f5e284c6db8ef3a6c2b5765 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 22:54:47 -0500 Subject: [PATCH 172/181] single precision FFTs are now supported on the CPU --- cmake/CMakeLists.txt | 5 ----- cmake/Modules/Packages/GPU.cmake | 4 ++++ doc/src/pair_amoeba.rst | 3 +++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 8184f9784d..a6956f5f5d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -527,11 +527,6 @@ foreach(PKG_WITH_INCL KSPACE PYTHON ML-IAP VORONOI COLVARS ML-HDNNP MDI MOLFILE endif() endforeach() -# AMOEBA is not compatible with single precision FFTs -if(PKG_AMOEBA AND FFT_SINGLE) - message(FATAL_ERROR "Package AMOEBA is not compatible with single precision FFTs") -endif() - # optionally enable building script wrappers using swig option(WITH_SWIG "Build scripting language wrappers with SWIG" OFF) if(WITH_SWIG) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 89e15e548b..2c766a2540 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -31,6 +31,10 @@ endif() option(GPU_DEBUG "Enable debugging code of the GPU package" OFF) mark_as_advanced(GPU_DEBUG) +if(PKG_AMOEBA AND FFT_SINGLE) + message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT") +endif() + file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu) diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index 79b3daf22f..6ef92a6938 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -206,6 +206,9 @@ These pair styles can only be used via the *pair* keyword of the when compiling the GPU package for OpenCL has a few known issues when running on integrated GPUs and the calculation may crash. + The GPU accelerated pair styles are also not (yet) compatible + with single precision FFTs. 
+ ---------- Restrictions From c744be70602631afe1d66aa7876823504d207a2b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 12:51:52 -0500 Subject: [PATCH 173/181] forcibly disable COMPRESS package is zlib is not found --- cmake/Modules/Packages/COMPRESS.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/Packages/COMPRESS.cmake b/cmake/Modules/Packages/COMPRESS.cmake index bdcf1aa3f8..4e1ab846a7 100644 --- a/cmake/Modules/Packages/COMPRESS.cmake +++ b/cmake/Modules/Packages/COMPRESS.cmake @@ -1,4 +1,9 @@ -find_package(ZLIB REQUIRED) +find_package(ZLIB) +if(NOT ZLIB_FOUND) + message(WARNING "No Zlib development support found. Disabling COMPRESS package...") + set(PKG_COMPRESS OFF CACHE BOOL "" FORCE) + return() +endif() target_link_libraries(lammps PRIVATE ZLIB::ZLIB) find_package(PkgConfig QUIET) From 4c996eed3beb9f970231c9fcbb99e096f93bfd44 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 23:22:55 -0500 Subject: [PATCH 174/181] auto-enabling prerequisite packages with CMake --- cmake/Modules/LAMMPSUtils.cmake | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake index d42f91f10e..9b42dafc44 100644 --- a/cmake/Modules/LAMMPSUtils.cmake +++ b/cmake/Modules/LAMMPSUtils.cmake @@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir) endfunction() macro(pkg_depends PKG1 PKG2) - if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2})) - message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package") + if(DEFINED BUILD_${PKG2}) + if(PKG_${PKG1} AND NOT BUILD_${PKG2}) + message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON") + endif() + elseif(DEFINED PKG_${PKG2}) + if(PKG_${PKG1} AND NOT PKG_${PKG2}) + message(WARNING "The ${PKG1} package depends on the ${PKG2} package. 
Enabling it.") + set(PKG_${PKG2} ON CACHE BOOL "" FORCE) + endif() endif() endmacro() From b206b4d1f63d724fb6b5151e8d70cc938dfe81fb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 24 Jan 2023 23:55:30 -0600 Subject: [PATCH 175/181] Fixed bugs with hippo/gpu for single- and mixed- precisions --- src/GPU/pair_hippo_gpu.cpp | 121 ++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 35 deletions(-) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 83c72d5252..0538096cc8 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -849,24 +849,44 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; + if (tq_single) { + auto field_ptr = (float *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; - } + auto fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } else { + + auto field_ptr = (double *)fieldp_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + auto fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } } /* ---------------------------------------------------------------------- @@ -1246,30 +1266,61 @@ void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, &fdip_sum_phi_pinned); int nlocal = atom->nlocal; - double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi1[i][m] = _fdip_phi1_ptr[n]; - n += nlocal; + if (tq_single) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } } - } - double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi2[i][m] = _fdip_phi2_ptr[n]; - n += nlocal; + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } } - } - double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 20; m++) { - fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; - n += nlocal; + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = 
_fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } } } } From adf43d7feefb5a65b6c3d0ddef66190e28c42cc8 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Jan 2023 00:02:25 -0600 Subject: [PATCH 176/181] Fixed the issues with some OpenCL implementation to avoid errors casting changing the pointer address spaces --- lib/gpu/lal_amoeba.cu | 51 ++++++++-------- lib/gpu/lal_atom.cpp | 2 +- lib/gpu/lal_atom.h | 2 +- lib/gpu/lal_base_amoeba.cpp | 49 +++++++++------- lib/gpu/lal_hippo.cu | 114 +++++++++++++++++------------------- 5 files changed, 106 insertions(+), 112 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 6317ba8d94..f572d3ebd0 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -410,7 +410,7 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -442,10 +442,10 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, acctyp4 tq; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; - - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -583,12 +581,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; // calculate the real space Ewald error function terms - + numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[6]; bn[0] = ucl_erfc(ralpha) * rinv; - + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -691,7 +689,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -707,14 +705,14 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - //local_allocate_store_charge(); + local_allocate_store_ufld(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - 
numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; @@ -1049,7 +1046,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -1068,7 +1065,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_ufld(); local_allocate_store_charge(); acctyp4 f; @@ -1086,11 +1082,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, for (int l=0; l<6; l++) dufld[l]=(acctyp)0; numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); - numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); + + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; + const __global numtyp4* polar4 = &extra[3*nall]; + const __global numtyp4* polar5 = &extra[4*nall]; if (ii0) - bytes+=_extra_fields*sizeof(numtyp); + bytes+=_extra_fields*sizeof(numtyp4); return bytes; } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 4b29d76cb1..771c2a3571 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -516,7 +516,7 @@ class Atom { /// Velocities UCL_Vector v; /// Extras - UCL_Vector extra; + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e80fa01c2b..09d7386461 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -90,7 +90,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, bool vel = false; _extra_fields = 24; // round up to accomodate quadruples of numtyp values // rpole 13; uind 3; uinp 3; amtype, amgroup; pval - int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields); + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); if (success!=0) return success; @@ -820,35 +820,35 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, atom->extra_data_unavail(); int _nall=atom->nall(); - numtyp *pextra=reinterpret_cast(&(atom->extra[0])); + numtyp4 *pextra=reinterpret_cast(&(atom->extra[0])); int n = 0; - int nstride = 4; + int nstride = 1; //4; if (rpole) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][0]; - pextra[idx+1] = rpole[i][1]; - pextra[idx+2] = rpole[i][2]; - pextra[idx+3] = rpole[i][3]; + pextra[idx].x = rpole[i][0]; + pextra[idx].y = rpole[i][1]; + pextra[idx].z = rpole[i][2]; + pextra[idx].w = rpole[i][3]; } n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][4]; - pextra[idx+1] = rpole[i][5]; - pextra[idx+2] = rpole[i][6]; - pextra[idx+3] = rpole[i][8]; + pextra[idx].x = rpole[i][4]; + pextra[idx].y = rpole[i][5]; + pextra[idx].z = rpole[i][6]; + pextra[idx].w = 
rpole[i][8]; } n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][9]; - pextra[idx+1] = rpole[i][12]; - pextra[idx+2] = (numtyp)amtype[i]; - pextra[idx+3] = (numtyp)amgroup[i]; + pextra[idx].x = rpole[i][9]; + pextra[idx].y = rpole[i][12]; + pextra[idx].z = (numtyp)amtype[i]; + pextra[idx].w = (numtyp)amgroup[i]; } } else { n += 2*nstride*_nall; @@ -858,9 +858,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (uind) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = uind[i][0]; - pextra[idx+1] = uind[i][1]; - pextra[idx+2] = uind[i][2]; + pextra[idx].x = uind[i][0]; + pextra[idx].y = uind[i][1]; + pextra[idx].z = uind[i][2]; + pextra[idx].w = 0; } } @@ -868,9 +869,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (uinp) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = uinp[i][0]; - pextra[idx+1] = uinp[i][1]; - pextra[idx+2] = uinp[i][2]; + pextra[idx].x = uinp[i][0]; + pextra[idx].y = uinp[i][1]; + pextra[idx].z = uinp[i][2]; + pextra[idx].w = 0; } } @@ -878,7 +880,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (pval) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = pval[i]; + pextra[idx].x = pval[i]; + pextra[idx].y = 0; + pextra[idx].z = 0; + pextra[idx].w = 0; } } } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 1611e8aece..99e20db223 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -410,7 +410,7 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_rep, const __global numtyp4 *restrict sp_nonpolar, const __global int *dev_nbor, @@ -444,9 +444,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, acctyp4 tq; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - const numtyp4 pol1j = polar1[j]; //numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -712,7 +710,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_nonpolar, @@ -741,7 +739,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, for (int l=0; l<6; l++) virial[l]=(acctyp)0; } - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar3 = &extra[2*nall]; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + int m; for (m = 1; m < 6; m++) { numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; @@ -1208,32 +1203,32 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ 
__kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_polar, const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict fieldp, - const int inum, const int nall, - const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); int n_stride; - //local_allocate_store_charge(); + local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; + const __global numtyp4* polar6 = &extra[5*nall]; if (ii Date: Wed, 25 Jan 2023 02:35:10 -0500 Subject: [PATCH 177/181] fix segfault from accessing float array as double. use introspection to detect --- src/GPU/pair_hippo_gpu.cpp | 54 +++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 0538096cc8..a12a7e1907 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -26,6 +26,7 @@ #include "fix_store_peratom.h" #include "force.h" #include "gpu_extra.h" +#include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" @@ -886,7 +887,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - } + } } /* ---------------------------------------------------------------------- @@ -1077,24 +1078,45 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (umutual1 and self) hippo_gpu_update_fieldp(&fieldp_pinned); - int inum = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + if (Info::has_accelerator_feature("GPU", "precision", "single")) { + float *field_ptr = (float *)fieldp_pinned; - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + float* fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + + } 
else { + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } } // accumulate timing information From e068b14969a76478058c5ed24e0ab91ad903fd4d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 25 Jan 2023 02:56:05 -0500 Subject: [PATCH 178/181] make consistent and simplify --- src/GPU/pair_amoeba_gpu.cpp | 46 +++++++++++++-------------- src/GPU/pair_hippo_gpu.cpp | 63 ++++++++++++++++--------------------- 2 files changed, 48 insertions(+), 61 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 4213946f38..941050cf04 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -203,7 +203,7 @@ void PairAmoebaGPU::init_style() if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - tq_single = tq_size != sizeof(double); + tq_single = (tq_size == sizeof(float)); // replace with the gpu counterpart @@ -285,10 +285,10 @@ void PairAmoebaGPU::multipole_real() // reference to the tep array from GPU lib if (tq_single) { - float *tq_ptr = (float *)tq_pinned; + auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { - double *tq_ptr = (double *)tq_pinned; + auto *tq_ptr = (double *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } } @@ -742,13 +742,12 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (float *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } else { auto field_ptr = (double *)fieldp_pinned; @@ -760,13 +759,12 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } @@ -975,13 +973,12 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (float *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } else { auto field_ptr = (double *)fieldp_pinned; @@ -993,13 +990,12 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - 
fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } @@ -1301,10 +1297,10 @@ void PairAmoebaGPU::polar_real() // reference to the tep array from GPU lib if (tq_single) { - float *tep_ptr = (float *)tq_pinned; + auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { - double *tep_ptr = (double *)tq_pinned; + auto *tep_ptr = (double *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } } diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index a12a7e1907..5956f1bc11 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -26,7 +26,6 @@ #include "fix_store_peratom.h" #include "force.h" #include "gpu_extra.h" -#include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" @@ -219,13 +218,9 @@ void PairHippoGPU::init_style() screen, polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); - if (gpu_mode == GPU_FORCE) - error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); - if (tq_size == sizeof(double)) - tq_single = false; - else - tq_single = true; + tq_single = (tq_size == sizeof(float)); // replace with the gpu counterpart @@ -302,10 +297,10 @@ void PairHippoGPU::repulsion() // reference to the tep array from GPU lib if (tq_single) { - float *tq_ptr = (float *)tq_pinned; + auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse } else { - double *tq_ptr = (double *)tq_pinned; + auto *tq_ptr = (double *)tq_pinned; compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse } } @@ -402,10 +397,10 @@ void PairHippoGPU::multipole_real() // reference to the tep array from GPU lib if (tq_single) { - float *tq_ptr = (float *)tq_pinned; + auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { - double *tq_ptr = (double *)tq_pinned; + auto *tq_ptr = (double *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } } @@ -860,13 +855,12 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (float *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } else { @@ -879,13 +873,12 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } } @@ -1080,8 +1073,8 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) hippo_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - if (Info::has_accelerator_feature("GPU", "precision", "single")) { - float *field_ptr = (float *)fieldp_pinned; + if (tq_single) { + auto *field_ptr = (float *)fieldp_pinned; for (int i 
= 0; i < nlocal; i++) { int idx = 4*i; @@ -1090,17 +1083,16 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - float* fieldp_ptr = (float *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } else { - double *field_ptr = (double *)fieldp_pinned; + auto *field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { int idx = 4*i; @@ -1109,13 +1101,12 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } @@ -1426,10 +1417,10 @@ void PairHippoGPU::polar_real() // reference to the tep array from GPU lib if (tq_single) { - float *tep_ptr = (float *)tq_pinned; + auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { - double *tep_ptr = (double *)tq_pinned; + auto *tep_ptr = (double *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } } From 722e583b591633736e2beaa3b4a29809d190efc8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 25 Jan 2023 05:22:49 -0500 Subject: [PATCH 179/181] use available introspection API to get accumulator data type. update name of flag. 
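In outline, the pair styles now ask the GPU package itself which precision it was built with, instead of having lal_amoeba_ext.cpp / lal_hippo_ext.cpp report sizeof(ACC_PRECISION) back through an extra tep_size / tq_size argument. A condensed sketch of the resulting host-side pattern (names as they appear in this series; not a verbatim excerpt of the diff below):

    // query the build-time precision of the GPU package once, in init_style()
    acc_float = Info::has_accelerator_feature("GPU", "precision", "single");

    // later, read a pinned buffer back with the matching pointer type
    if (acc_float) {
      auto *tq_ptr = (float *) tq_pinned;    // single-precision builds accumulate in float
      compute_force_from_torque(tq_ptr, f, virpolar);
    } else {
      auto *tq_ptr = (double *) tq_pinned;   // mixed/double builds accumulate in double
      compute_force_from_torque(tq_ptr, f, virpolar);
    }

This keeps the host-side reads consistent with the segfault fix in PATCH 177/181, where a buffer written by a single-precision GPU build was being read through a double pointer.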
--- lib/gpu/lal_amoeba_ext.cpp | 5 +---- lib/gpu/lal_hippo_ext.cpp | 5 +---- src/GPU/pair_amoeba_gpu.cpp | 20 ++++++++++---------- src/GPU/pair_amoeba_gpu.h | 2 +- src/GPU/pair_hippo_gpu.cpp | 23 ++++++++++++----------- src/GPU/pair_hippo_gpu.h | 2 +- 6 files changed, 26 insertions(+), 31 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index fe3d4a26d8..995dfbe95f 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -41,8 +41,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double polar_dscale, const double polar_uscale, - int& tep_size) { + const double polar_dscale, const double polar_uscale) { AMOEBAMF.clear(); gpu_mode=AMOEBAMF.device->gpu_mode(); double gpu_split=AMOEBAMF.device->particle_split(); @@ -52,8 +51,6 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas int gpu_rank=AMOEBAMF.device->gpu_rank(); int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); - tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION); - AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); bool message=false; diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index b5ac42744a..0cb00387ca 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -42,8 +42,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double polar_dscale, const double polar_uscale, - int& tep_size) { + const double polar_dscale, const double polar_uscale) { HIPPOMF.clear(); gpu_mode=HIPPOMF.device->gpu_mode(); double gpu_split=HIPPOMF.device->particle_split(); @@ -53,8 +52,6 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass int gpu_rank=HIPPOMF.device->gpu_rank(); int procs_per_gpu=HIPPOMF.device->procs_per_gpu(); - tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION); - HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu); bool message=false; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 941050cf04..fd423486fd 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -26,6 +26,7 @@ #include "fix_store_peratom.h" #include "force.h" #include "gpu_extra.h" +#include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" @@ -66,7 +67,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double polar_dscale, const double polar_uscale, int& tq_size); + const double polar_dscale, const double polar_uscale); void amoeba_gpu_clear(); int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, @@ -188,7 +189,6 @@ void PairAmoebaGPU::init_style() maxspecial15=atom->maxspecial15; } - int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, pdamp, thole, dirdamp, amtype2class, special_hal, @@ -197,13 +197,13 @@ void PairAmoebaGPU::init_style() special_polar_pscale, csix, adisp, atom->nlocal, atom->nlocal+atom->nghost, mnf, 
maxspecial, maxspecial15, cell_size, gpu_mode, screen, - polar_dscale, polar_uscale, tq_size); + polar_dscale, polar_uscale); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - tq_single = (tq_size == sizeof(float)); + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); // replace with the gpu counterpart @@ -284,7 +284,7 @@ void PairAmoebaGPU::multipole_real() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { @@ -732,7 +732,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - if (tq_single) { + if (acc_float) { auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { @@ -963,7 +963,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) amoeba_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - if (tq_single) { + if (acc_float) { auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { @@ -1161,7 +1161,7 @@ void PairAmoebaGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, &fdip_sum_phi_pinned); int nlocal = atom->nlocal; - if (tq_single) { + if (acc_float) { auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int n = i; @@ -1296,7 +1296,7 @@ void PairAmoebaGPU::polar_real() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { @@ -1492,7 +1492,7 @@ void PairAmoebaGPU::polar_kspace() } else { void* fphi_pinned = nullptr; amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); - if (tq_single) { + if (acc_float) { auto _fphi_ptr = (float *)fphi_pinned; for (int i = 0; i < nlocal; i++) { int idx = i; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index c9b9b73a58..be53f7ef50 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -49,7 +49,7 @@ class PairAmoebaGPU : public PairAmoeba { double cpu_time; void *tq_pinned; void *fieldp_pinned; - bool tq_single; + bool acc_float; bool gpu_hal_ready; bool gpu_repulsion_ready; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 5956f1bc11..9d286d5db7 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -26,6 +26,7 @@ #include "fix_store_peratom.h" #include "force.h" #include "gpu_extra.h" +#include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" @@ -67,7 +68,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double polar_dscale, const double polar_uscale, int& tq_size); + const double polar_dscale, const double polar_uscale); void hippo_gpu_clear(); int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, @@ -205,7 +206,6 @@ void PairHippoGPU::init_style() maxspecial15=atom->maxspecial15; } - int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, pdamp, thole, dirdamp, amtype2class, @@ -215,12 +215,13 @@ void PairHippoGPU::init_style() csix, adisp, pcore, palpha, 
atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, - screen, polar_dscale, polar_uscale, tq_size); + screen, polar_dscale, polar_uscale); GPU_EXTRA::check_flag(success,error,world); - if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); - tq_single = (tq_size == sizeof(float)); + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); // replace with the gpu counterpart @@ -296,7 +297,7 @@ void PairHippoGPU::repulsion() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse } else { @@ -396,7 +397,7 @@ void PairHippoGPU::multipole_real() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { @@ -845,7 +846,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - if (tq_single) { + if (acc_float) { auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { @@ -1073,7 +1074,7 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) hippo_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - if (tq_single) { + if (acc_float) { auto *field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { @@ -1279,7 +1280,7 @@ void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, &fdip_sum_phi_pinned); int nlocal = atom->nlocal; - if (tq_single) { + if (acc_float) { auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int n = i; @@ -1416,7 +1417,7 @@ void PairHippoGPU::polar_real() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 7955c97470..d160446d77 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -50,7 +50,7 @@ class PairHippoGPU : public PairAmoeba { double cpu_time; void *tq_pinned; void *fieldp_pinned; - bool tq_single; + bool acc_float; bool gpu_hal_ready; bool gpu_repulsion_ready; From 6fefd8821a96e0e24c6c05f7a82de37ea06b5222 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Jan 2023 10:42:55 -0600 Subject: [PATCH 180/181] Attempted to allow GPU acceleration on MacOS with neighbor builds on the device by enforcing the old neighbor list code path (will revisit) --- lib/gpu/lal_neighbor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 5b569f804a..482b93d9e5 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -33,7 +33,7 @@ #endif #endif -#if defined(USE_HIP) +#if defined(USE_HIP) || defined(__APPLE__) #define LAL_USE_OLD_NEIGHBOR #endif From 7e5e5c1b6f1704ac0834927cf20d938542ef7bdb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Jan 2023 13:30:29 -0600 Subject: [PATCH 181/181] Only added amoeba_convolution_gpu.* to the list of GPU source files when PKG_AMOEBA is on --- cmake/Modules/Packages/GPU.cmake | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git 
a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 2c766a2540..24d9538206 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -3,9 +3,7 @@ set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h ${GPU_SOURCES_DIR}/fix_gpu.h ${GPU_SOURCES_DIR}/fix_gpu.cpp ${GPU_SOURCES_DIR}/fix_nh_gpu.h - ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp - ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h - ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) + ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp) target_compile_definitions(lammps PRIVATE -DLMP_GPU) set(GPU_API "opencl" CACHE STRING "API used by GPU package") @@ -35,6 +33,12 @@ if(PKG_AMOEBA AND FFT_SINGLE) message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT") endif() +if (PKG_AMOEBA) + list(APPEND GPU_SOURCES + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) +endif() + file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
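The read-back pattern that patches 177-179 converge on assumes a fixed layout of fieldp_pinned: the direct field occupies the first 4*inum entries and the "p" field the next 4*inum entries, four slots per atom (three of which are used), stored in the GPU accumulator type. The sketch below only restates that layout; add_fieldp is a hypothetical helper for illustration and is not part of the patches, while the surrounding names (acc_float, fieldp_pinned, field, fieldp) follow the series.

    // Hypothetical helper documenting the pinned-buffer layout:
    // field[] occupies entries [0, 4*inum), fieldp[] occupies [4*inum, 8*inum).
    template <class T>
    static void add_fieldp(const T *ptr, double **field, double **fieldp,
                           int nlocal, int inum)
    {
      for (int i = 0; i < nlocal; i++)
        for (int k = 0; k < 3; k++) field[i][k] += ptr[4*i + k];
      ptr += 4*inum;                          // advance to the fieldp section
      for (int i = 0; i < nlocal; i++)
        for (int k = 0; k < 3; k++) fieldp[i][k] += ptr[4*i + k];
    }

    // usage mirroring ufield0c()/udirect2b() after this series:
    //   if (acc_float) add_fieldp((const float *)  fieldp_pinned, field, fieldp, nlocal, inum);
    //   else           add_fieldp((const double *) fieldp_pinned, field, fieldp, nlocal, inum);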