diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index e02d849ba4..2491493c45 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -4,7 +4,7 @@ units real boundary p p p atom_style amoeba - +#atom_modify sort 1000 7.0 bond_style class2 angle_style amoeba dihedral_style none diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index c7b4872db0..0d78a8618a 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -57,7 +57,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, - cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar"); + cell_size,gpu_split,_screen,amoeba, + "k_amoeba_polar", "k_amoeba_udirect2b"); if (success!=0) return success; @@ -164,15 +165,14 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); -/* - this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, - &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); -*/ + + this->k_udirect2b.set_size(GX,BX); + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &_aewald, &_off2, + &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); return GX; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 3d28939d42..adcff0e648 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -91,8 +91,8 @@ _texture( q_tex,int2); tep[i]=t; \ } -#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ - i, field, fieldp) \ +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i, \ + fieldp) \ if (t_per_atom>1) { \ red_acc[0][tid]=_fieldp[0]; \ red_acc[1][tid]=_fieldp[1]; \ @@ -118,8 +118,8 @@ _texture( q_tex,int2); numtyp4 f, fp; \ f.x = _fieldp[0]; f.y = _fieldp[0]; f.z = _fieldp[2]; \ fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5]; \ - field[i] = f; \ - fieldp[i] = fp; \ + fieldp[ii] = f; \ + fieldp[ii+inum] = fp; \ } #else @@ -152,8 +152,8 @@ _texture( q_tex,int2); tep[i]=t; \ } -#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ - i, field, fieldp) \ +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i, \ + fieldp) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ @@ -168,8 +168,8 @@ _texture( q_tex,int2); numtyp4 f, fp; \ f.x = _fieldp[0]; f.y = _fieldp[0]; f.z = _fieldp[2]; \ fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5]; \ - field[i] = f; \ - fieldp[i] = fp; \ + fieldp[ii] = f; \ + fieldp[ii+inum] = fp; \ } #endif @@ -177,6 +177,11 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict damping, @@ -468,7 +473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; numtyp tixx = ci*term3 + dix*term4 + dir*term5 + - (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6; + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; @@ -684,19 +689,23 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv); } +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - __global numtyp4 *restrict field, - __global numtyp4 *restrict fieldp, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp felec, - const numtyp off2, const numtyp polar_dscale, - const numtyp polar_uscale) + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -771,7 +780,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; - numtyp rr1 = felec * rinv; + numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; @@ -888,7 +897,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, // accumulate field and fieldp - store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,field,fieldp); + store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp); } /* ---------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 0c9a422cec..a1cf516777 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; k_polar.clear(); + k_udirect2b.clear(); k_special15.clear(); if (pair_program) delete pair_program; } @@ -53,7 +54,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name_polar, + const char *k_name_udirect2b) { screen=_screen; int gpu_nbor=0; @@ -85,7 +87,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -118,9 +120,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, if (ef_nall==0) ef_nall=2000; - _max_alloc_size=static_cast(static_cast(ef_nall)*1.10); - _fieldp.alloc(_max_alloc_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _tep.alloc(_max_alloc_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _max_fieldp_size = _max_tep_size; + _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); @@ -224,7 +227,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, +void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -252,9 +255,9 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, // ------------------- Resize _tep array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_alloc_size*4); + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -302,6 +305,10 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); } // --------------------------------------------------------------------------- @@ -338,9 +345,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const // ------------------- Resize _tep array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_alloc_size*4); + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -397,9 +404,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const // copy tep from device to host - _tep.update_host(_max_alloc_size*4,false); + _tep.update_host(_max_tep_size*4,false); /* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_alloc_size); + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); @@ -442,9 +449,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // ------------------- Resize _fieldp array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _fieldp.resize(_max_alloc_size*8); + if (nall>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(nall)*1.10); + _fieldp.resize(_max_fieldp_size*8); dev_nspecial15.clear(); dev_special15.clear(); @@ -492,13 +499,18 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i *jnum=nbor->host_acc.begin(); const int red_blocks=udirect2b(eflag,vflag); - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); hd_balancer.stop_timer(); - // copy field and fieldp from device to host + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - //_fieldp.update_host(_max_field_size*8,false); + _fieldp.update_host(_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", this->_field.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ return nbor->host_jlist.begin()-host_start; } @@ -566,7 +578,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { + const char *kname_polar, + const char *kname_udirect2b) { if (_compiled) return; @@ -575,7 +588,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - k_polar.set_function(*pair_program,kname); + k_polar.set_function(*pair_program,kname_polar); + k_udirect2b.set_function(*pair_program,kname_udirect2b); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); @@ -593,6 +607,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, } +// --------------------------------------------------------------------------- +// Specify 1-5 neighbors from the current neighbor list +// --------------------------------------------------------------------------- + template int BaseAmoebaT::add_onefive_neighbors() { // Compute the block size and grid size to keep all cores busy diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 7ef94c776e..ae0f33ef29 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -53,8 +53,8 @@ class BaseAmoeba { * - -5 Double precision is not supported on card **/ int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, - const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const double gpu_split, FILE *screen, const void *pair_program, + const char *kname_polar, const char *kname_udirect2b); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -129,7 +129,7 @@ class BaseAmoeba { bool &success); /// Compute polar real-space with host neighboring (not active for now) - void compute(const int f_ago, const int inum_full, const int nall, + void compute_polar_real(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, int *ilist, int *numj, @@ -190,8 +190,8 @@ class BaseAmoeba { double** uind, double** uinp); /// Per-atom arrays - UCL_Vector _tep,_fieldp; - int _max_alloc_size; + UCL_Vector _tep, _fieldp; + int _max_tep_size, _max_fieldp_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -210,7 +210,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_polar,k_special15; + UCL_Kernel k_polar, k_udirect2b, k_special15; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -226,7 +226,8 @@ class BaseAmoeba { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *kname_polar, const char *kname_udirect2b); virtual int polar_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3cdaa25633..a5cc86e39d 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -298,7 +298,7 @@ void PairAmoebaGPU::init_style() void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = false; + bool gpu_udirect2b_ready = true; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; @@ -334,7 +334,28 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); - + + // get field and fieldp values from the GPU lib + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] = field_ptr[idx]; + field[i][1] = field_ptr[idx+1]; + field[i][2] = field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] = fieldp_ptr[idx]; + fieldp[i][1] = fieldp_ptr[idx+1]; + fieldp[i][2] = fieldp_ptr[idx+2]; + } + // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs diff --git a/src/atom.cpp b/src/atom.cpp index 4ad5110ec9..86e2b1151b 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2274,7 +2274,6 @@ void Atom::setup_sort_bins() #ifdef LMP_GPU if (userbinsize == 0.0) { int ifix = modify->find_fix("package_gpu"); -/* if (ifix >= 0) { const double subx = domain->subhi[0] - domain->sublo[0]; const double suby = domain->subhi[1] - domain->sublo[1]; @@ -2298,7 +2297,6 @@ void Atom::setup_sort_bins() bininvy = bininv; bininvz = bininv; } -*/ } #endif