diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 0d78a8618a..8bcbd6c4cb 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -58,7 +58,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_polar", "k_amoeba_udirect2b"); + "k_amoeba_polar", "k_amoeba_udirect2b", + "k_amoeba_umutual2b"); if (success!=0) return success; @@ -152,7 +153,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { @@ -177,5 +178,31 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Calculate the real-space induced field, returning field and fieldp +// --------------------------------------------------------------------------- +template +int AmoebaT::umutual2b(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int _nall=this->atom->nall(); + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &_aewald, &_off2, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + template class Amoeba; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index c4f146a7c9..192f440112 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -907,6 +907,216 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp); } +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp _fieldp[6]; + for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; + + numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + //numtyp4 xi__; + + if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // ii { protected: bool _allocated; - int polar_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + }; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 9baa7b30d3..6bcd6c50c7 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -38,6 +38,7 @@ BaseAmoebaT::~BaseAmoeba() { delete nbor; k_polar.clear(); k_udirect2b.clear(); + k_umutual2b.clear(); k_special15.clear(); if (pair_program) delete pair_program; } @@ -55,7 +56,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, const char *k_name_polar, - const char *k_name_udirect2b) { + const char *k_name_udirect2b, + const char *k_name_umutual2b) { screen=_screen; int gpu_nbor=0; @@ -87,7 +89,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -230,7 +232,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall, +void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -473,6 +475,75 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i return firstneigh; //nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the induced field +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void** fieldp_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer extra data from the host + // and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _fieldp array ------------------------ + + if (nall>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(nall)*1.10); + _fieldp.resize(_max_fieldp_size*8); + } + *fieldp_ptr=_fieldp.host.begin(); + + const int red_blocks=umutual2b(eflag,vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + _fieldp.update_host(_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- @@ -551,7 +622,6 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const return firstneigh; // nbor->host_jlist.begin()-host_start; } - template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ @@ -621,7 +691,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_polar, - const char *kname_udirect2b) { + const char *kname_udirect2b, + const char *kname_umutual2b) { if (_compiled) return; @@ -632,6 +703,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_polar.set_function(*pair_program,kname_polar); k_udirect2b.set_function(*pair_program,kname_udirect2b); + k_umutual2b.set_function(*pair_program,kname_umutual2b); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 7d4f4c00b5..3fb752c97c 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,7 +54,8 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_polar, const char *kname_udirect2b); + const char *kname_polar, const char *kname_udirect2b, + const char *kname_umutual2b); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -140,15 +141,31 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - /// Compute polar real-space with host neighboring (not active for now) - void compute_polar_real(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *charge, - const int nlocal, double *boxlo, double *prd, void **tep_ptr); + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + int** compute_udirect2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); + + /// Compute the real space part of the induced field (umutual2b) with device neighboring + int** compute_umutual2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring int** compute_polar_real(const int ago, const int inum_full, const int nall, @@ -162,18 +179,15 @@ class BaseAmoeba { int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd, void **tep_ptr); - /// Compute the direct real space part of the permanent field (udirect2b) with device neighboring - int** compute_udirect2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + /// Compute polar real-space with host neighboring (not active for now) + void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -224,7 +238,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_polar, k_udirect2b, k_special15; + UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -241,10 +255,13 @@ class BaseAmoeba { UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_polar, const char *kname_udirect2b); + const char *kname_polar, const char *kname_udirect2b, + const char *kname_umutual2b); - virtual int polar_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; + virtual int umutual2b(const int eflag, const int vflag) = 0; + virtual int polar_real(const int eflag, const int vflag) = 0; + }; } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 0ec601de47..b2318d296e 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -365,9 +365,9 @@ class PairAmoeba : public Pair { void ulspred(); void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); - virtual void dfield0c(double **, double **); + void dfield0c(double **, double **); void umutual1(double **, double **); - void umutual2b(double **, double **); + virtual void umutual2b(double **, double **); void udirect1(double **); virtual void udirect2b(double **, double **); void dampmut(double, double, double, double *); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3280e7b093..a1c21da3dd 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -74,7 +74,18 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd, void **fieldp_ptr); - +/* +int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **fieldp_ptr); +*/ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -100,6 +111,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tep_pinned = nullptr; gpu_udirect2b_ready = true; + gpu_umutual2b_ready = false; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -142,16 +154,7 @@ void PairAmoebaGPU::polar_real() domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } inum = atom->nlocal; -/* - if (comm->me == 0) { - printf("GPU: polar real\n"); - for (int i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } -*/ + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, @@ -993,6 +996,74 @@ void PairAmoebaGPU::udirect2b_cpu() } } +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } +/* + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, uind, uinp, + sublo, subhi, atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +*/ +} + /* ---------------------------------------------------------------------- */ double PairAmoebaGPU::memory_usage() diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index d0cbad90a2..4dc547e469 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -37,6 +37,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void polar_real(); virtual void udirect2b(double **, double **); + virtual void umutual2b(double **, double **); private: int gpu_mode; @@ -45,8 +46,9 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tep_single; - bool gpu_polar_real_ready; bool gpu_udirect2b_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; void udirect2b_cpu();