diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index 6b0b563d9f..f1e0320b8c 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff,"k_tersoff_repulsive", - "k_tersoff_three_center", "k_tersoff_three_end"); + "k_tersoff_three_center", "k_tersoff_three_end", + "k_tersoff_short_nbor"); if (success!=0) return success; @@ -157,8 +158,12 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } + _cutshortsq = static_cast(cutsqmax); cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); @@ -250,7 +255,7 @@ void TersoffT::compute(const int f_ago, const int inum_full, const int nall, this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); + this->_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); } this->atom->cast_x_data(host_x,host_type); @@ -258,29 +263,19 @@ void TersoffT::compute(const int f_ago, const int inum_full, const int nall, this->atom->add_x_data(host_x,host_type); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } + this->_ainum=nlist; + int _eflag; if (eflag) _eflag=1; else _eflag=0; - int ainum=nlist; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - int evatom=0; if (eatom || vatom) evatom=1; @@ -329,7 +324,7 @@ int ** TersoffT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; @@ -343,29 +338,19 @@ int ** TersoffT::compute(const int ago, const int inum_full, *jnum=this->nbor->host_acc.begin(); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } + this->_ainum=nall; + int _eflag; if (eflag) _eflag=1; else _eflag=0; - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - int evatom=0; if (eatom || vatom) evatom=1; @@ -402,9 +387,32 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &_cutshortsq, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -423,6 +431,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -437,7 +446,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -446,7 +455,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index b7d48d9e34..d132545984 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -164,6 +164,57 @@ texture ts5_tex; #endif +__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, + const numtyp cutshortsq, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - - int _max_nbors; + numtyp _cutshortsq; private: bool _allocated;