diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index b40bb7f943..47a216ff6f 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -22,21 +22,21 @@ offset=tid & (t_per_atom-1); \ ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; -#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \ - i, numj, stride, nbor_end, nbor_begin) \ - i=nbor_mem[ii]; \ - nbor_begin=ii+nbor_stride; \ - numj=nbor_mem[nbor_begin]; \ - if (nbor_mem==packed_mem) { \ - nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \ - stride=fast_mul(t_per_atom,nbor_stride); \ - nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \ +#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \ + i, numj, n_stride, nbor_end, nbor_begin) \ + i=dev_nbor[ii]; \ + nbor_begin=ii+nbor_pitch; \ + numj=dev_nbor[nbor_begin]; \ + if (dev_nbor==dev_packed) { \ + nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \ + n_stride=fast_mul(t_per_atom,nbor_pitch); \ + nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \ nbor_begin+=offset; \ } else { \ - nbor_begin+=nbor_stride; \ - nbor_begin=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_pitch; \ + nbor_begin=dev_nbor[nbor_begin]; \ nbor_end=nbor_begin+numj; \ - stride=t_per_atom; \ + n_stride=t_per_atom; \ nbor_begin+=offset; \ } diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index f772e36295..fd9fc7f272 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -20,7 +20,7 @@ using namespace LAMMPS_AL; extern Device global_device; template -BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { +BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _short_nbor(false) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_two, const char *k_three_center, - const char *k_three_end) { + const char *two, const char *three_center, + const char *three_end, const char *short_nbor) { screen=_screen; int gpu_nbor=0; @@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { + if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1 nbor->packing(true); _nbor_data=&(nbor->dev_packed); - } else + } else // neigh yes or tpa == 1 _nbor_data=&(nbor->dev_nbor); if (_threads_per_atom*_threads_per_atom>device->warp_size()) return -10; @@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _block_pair=device->pair_block_size(); _block_size=device->block_ellipse(); - compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end); + compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -113,6 +113,15 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _max_an_bytes+=ans2->gpu_bytes(); #endif + // if short neighbor list is supported + if (short_nbor) { + _short_nbor = true; + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + } + return 0; } @@ -136,6 +145,7 @@ void BaseThreeT::clear_atomic() { k_three_end.clear(); k_three_end_vatom.clear(); k_pair.clear(); + k_short_nbor.clear(); delete pair_program; _compiled=false; } @@ -143,6 +153,7 @@ void BaseThreeT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); + dev_short_nbor.clear(); nbor->clear(); ans->clear(); #ifdef THREE_CONCURRENT @@ -247,12 +258,26 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; + _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist); } atom->cast_x_data(host_x,host_type); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); + // if short neighbor list is supported + if (_short_nbor) { + + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + } + + // _ainum to be used in loop() for short neighbor list build + _ainum = nlist; + int evatom=0; if (eatom || vatom) evatom=1; @@ -300,7 +325,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; @@ -313,6 +338,19 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); + // if short neighbor list is supported + if (_short_nbor) { + + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + } + + // _ainum to be used in loop() for short neighbor list build + _ainum = nall; + int evatom=0; if (eatom || vatom) evatom=1; @@ -339,19 +377,20 @@ double BaseThreeT::host_memory_usage_atomic() const { template void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *ktwo, const char *kthree_center, - const char *kthree_end) { + const char *two, const char *three_center, + const char *three_end, const char* short_nbor) { if (_compiled) return; - std::string vatom_name=std::string(kthree_end)+"_vatom"; + std::string vatom_name=std::string(three_end)+"_vatom"; pair_program=new UCL_Program(dev); pair_program->load_string(pair_str,device->compile_string().c_str()); - k_three_center.set_function(*pair_program,kthree_center); - k_three_end.set_function(*pair_program,kthree_end); + k_three_center.set_function(*pair_program,three_center); + k_three_end.set_function(*pair_program,three_end); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); - k_pair.set_function(*pair_program,ktwo); + k_pair.set_function(*pair_program,two); + if (short_nbor) k_short_nbor.set_function(*pair_program,short_nbor); pos_tex.get_texture(*pair_program,"pos_tex"); #ifdef THREE_CONCURRENT diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index 4f27ecdf92..d03a7521cd 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -56,7 +56,8 @@ class BaseThree { const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *k_two, - const char *k_three_center, const char *k_three_end); + const char *k_three_center, const char *k_three_end, + const char *k_short_nbor=NULL); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -74,8 +75,8 @@ class BaseThree { /// Check if there is enough storage for neighbors and realloc if not /** \param nlocal number of particles whose nbors must be stored on device - * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors + * \param max_nbors maximum number of neighbors + * \param success set to false if insufficient memory * \note olist_size=total number of local particles **/ inline void resize_local(const int inum, const int max_nbors, bool &success) { nbor->resize(inum,max_nbors,success); @@ -84,7 +85,7 @@ class BaseThree { /// Check if there is enough storage for neighbors and realloc if not /** \param nlocal number of particles whose nbors must be stored on device * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors + * \param max_nbors current maximum number of neighbors * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ @@ -143,14 +144,6 @@ class BaseThree { const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); - /// Pair loop with device neighboring - int * compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); - /// Pair loop with device neighboring int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -193,6 +186,9 @@ class BaseThree { /// Neighbor data Neighbor *nbor; + UCL_D_Vec dev_short_nbor; + UCL_Kernel k_short_nbor; + // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom; @@ -203,16 +199,17 @@ class BaseThree { UCL_Texture pos_tex; protected: - bool _compiled; + bool _compiled,_short_nbor; int _block_pair, _block_size, _threads_per_atom, _end_command_queue; int _gpu_nbor; double _max_bytes, _max_an_bytes; + int _max_nbors, _ainum; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *k_two, const char *k_three_center, - const char *k_three_end); + const char *two, const char *three_center, + const char *three_end, const char* short_nbor); virtual void loop(const bool _eflag, const bool _vflag, const int evatom) = 0; diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 3492d7030e..24984e4878 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_ int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,sw,"k_sw","k_sw_three_center", - "k_sw_three_end"); + "k_sw_three_end","k_sw_short_nbor"); if (success!=0) return success; @@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + // build the short neighbor list + int ainum=this->_ainum; + int nbor_pitch=this->nbor->nbor_pitch(); + int GX=static_cast(ceil(static_cast(ainum)/ (BX/this->_threads_per_atom))); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); this->time_pair.start(); - + this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 46330c59e4..7dea52898e 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -130,6 +130,64 @@ texture sw3_tex; #endif +__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict sw3, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (iiinit_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", - "k_vashishta_three_end"); + "k_vashishta_three_end","k_vashishta_short_nbor"); if (success!=0) return success; @@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); + double r0sqmax = 0; for (int i=0; i(r0sq); dview[i].y=static_cast(gamma[i]); dview[i].z=static_cast(cutsq[i]); dview[i].w=static_cast(r0[i]); } + _cutshortsq = static_cast(r0sqmax); + ucl_copy(param4,dview,false); param4_tex.get_texture(*(this->pair_program),"param4_tex"); param4_tex.bind_float(param4,4); @@ -223,15 +226,27 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + // build the short neighbor list + int ainum=this->_ainum; + int nbor_pitch=this->nbor->nbor_pitch(); + int GX=static_cast(ceil(static_cast(ainum)/ (BX/this->_threads_per_atom))); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &_cutshortsq, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); this->time_pair.start(); + // note that k_pair does not run with the short neighbor list this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, @@ -248,6 +263,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -257,21 +273,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { end_ans=this->ans; #endif if (evatom!=0) { - this->k_three_end_vatom.set_size(GX,BX); this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index caa3c03613..7449b18f6b 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -136,6 +136,56 @@ texture param5_tex; #endif +__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_, + const numtyp cutshortsq, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii { UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; + numtyp _cutshortsq; UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;