Supported short neighbor lists for 3-body kernels in sw/gpu and vashishta/gpu

This commit is contained in:
Trung Nguyen
2017-07-07 16:47:24 -05:00
parent 4339379948
commit 68206079da
8 changed files with 332 additions and 95 deletions

View File

@ -22,21 +22,21 @@
offset=tid & (t_per_atom-1); \ offset=tid & (t_per_atom-1); \
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \ #define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
i, numj, stride, nbor_end, nbor_begin) \ i, numj, n_stride, nbor_end, nbor_begin) \
i=nbor_mem[ii]; \ i=dev_nbor[ii]; \
nbor_begin=ii+nbor_stride; \ nbor_begin=ii+nbor_pitch; \
numj=nbor_mem[nbor_begin]; \ numj=dev_nbor[nbor_begin]; \
if (nbor_mem==packed_mem) { \ if (dev_nbor==dev_packed) { \
nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \ nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \
stride=fast_mul(t_per_atom,nbor_stride); \ n_stride=fast_mul(t_per_atom,nbor_pitch); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \ nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
nbor_begin+=offset; \ nbor_begin+=offset; \
} else { \ } else { \
nbor_begin+=nbor_stride; \ nbor_begin+=nbor_pitch; \
nbor_begin=nbor_mem[nbor_begin]; \ nbor_begin=dev_nbor[nbor_begin]; \
nbor_end=nbor_begin+numj; \ nbor_end=nbor_begin+numj; \
stride=t_per_atom; \ n_stride=t_per_atom; \
nbor_begin+=offset; \ nbor_begin+=offset; \
} }

View File

@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
extern Device<PRECISION,ACC_PRECISION> global_device; extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _short_nbor(false) {
device=&global_device; device=&global_device;
ans=new Answer<numtyp,acctyp>(); ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor(); nbor=new Neighbor();
@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
const int max_nbors, const int maxspecial, const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split, const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program, FILE *_screen, const void *pair_program,
const char *k_two, const char *k_three_center, const char *two, const char *three_center,
const char *k_three_end) { const char *three_end, const char *short_nbor) {
screen=_screen; screen=_screen;
int gpu_nbor=0; int gpu_nbor=0;
@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_gpu_host=1; _gpu_host=1;
_threads_per_atom=device->threads_per_atom(); _threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) { if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
nbor->packing(true); nbor->packing(true);
_nbor_data=&(nbor->dev_packed); _nbor_data=&(nbor->dev_packed);
} else } else // neigh yes or tpa == 1
_nbor_data=&(nbor->dev_nbor); _nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size()) if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10; return -10;
@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_block_pair=device->pair_block_size(); _block_pair=device->pair_block_size();
_block_size=device->block_ellipse(); _block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end); compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
// Initialize host-device load balancer // Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split); hd_balancer.init(device,gpu_nbor,gpu_split);
@ -113,6 +113,15 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_max_an_bytes+=ans2->gpu_bytes(); _max_an_bytes+=ans2->gpu_bytes();
#endif #endif
// if short neighbor list is supported
if (short_nbor) {
_short_nbor = true;
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
}
return 0; return 0;
} }
@ -136,6 +145,7 @@ void BaseThreeT::clear_atomic() {
k_three_end.clear(); k_three_end.clear();
k_three_end_vatom.clear(); k_three_end_vatom.clear();
k_pair.clear(); k_pair.clear();
k_short_nbor.clear();
delete pair_program; delete pair_program;
_compiled=false; _compiled=false;
} }
@ -143,6 +153,7 @@ void BaseThreeT::clear_atomic() {
time_pair.clear(); time_pair.clear();
hd_balancer.clear(); hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear(); nbor->clear();
ans->clear(); ans->clear();
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT
@ -247,12 +258,26 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success) if (!success)
return; return;
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
} }
atom->cast_x_data(host_x,host_type); atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer(); hd_balancer.start_timer();
atom->add_x_data(host_x,host_type); atom->add_x_data(host_x,host_type);
// if short neighbor list is supported
if (_short_nbor) {
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nlist;
int evatom=0; int evatom=0;
if (eatom || vatom) if (eatom || vatom)
evatom=1; evatom=1;
@ -300,7 +325,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success); sublo, subhi, tag, nspecial, special, success);
if (!success) if (!success)
return NULL; return NULL;
@ -313,6 +338,19 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin(); *ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin(); *jnum=nbor->host_acc.begin();
// if short neighbor list is supported
if (_short_nbor) {
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nall;
int evatom=0; int evatom=0;
if (eatom || vatom) if (eatom || vatom)
evatom=1; evatom=1;
@ -339,19 +377,20 @@ double BaseThreeT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *ktwo, const char *kthree_center, const char *two, const char *three_center,
const char *kthree_end) { const char *three_end, const char* short_nbor) {
if (_compiled) if (_compiled)
return; return;
std::string vatom_name=std::string(kthree_end)+"_vatom"; std::string vatom_name=std::string(three_end)+"_vatom";
pair_program=new UCL_Program(dev); pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str()); pair_program->load_string(pair_str,device->compile_string().c_str());
k_three_center.set_function(*pair_program,kthree_center); k_three_center.set_function(*pair_program,three_center);
k_three_end.set_function(*pair_program,kthree_end); k_three_end.set_function(*pair_program,three_end);
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
k_pair.set_function(*pair_program,ktwo); k_pair.set_function(*pair_program,two);
if (short_nbor) k_short_nbor.set_function(*pair_program,short_nbor);
pos_tex.get_texture(*pair_program,"pos_tex"); pos_tex.get_texture(*pair_program,"pos_tex");
#ifdef THREE_CONCURRENT #ifdef THREE_CONCURRENT

View File

@ -56,7 +56,8 @@ class BaseThree {
const int maxspecial, const double cell_size, const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two, const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end); const char *k_three_center, const char *k_three_end,
const char *k_short_nbor=NULL);
/// Estimate the overhead for GPU context changes and CPU driver /// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(); void estimate_gpu_overhead();
@ -74,8 +75,8 @@ class BaseThree {
/// Check if there is enough storage for neighbors and realloc if not /// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device /** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host * \param max_nbors maximum number of neighbors
* \param current maximum number of neighbors * \param success set to false if insufficient memory
* \note olist_size=total number of local particles **/ * \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) { inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success); nbor->resize(inum,max_nbors,success);
@ -84,7 +85,7 @@ class BaseThree {
/// Check if there is enough storage for neighbors and realloc if not /// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device /** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host * \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors * \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring * \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles * \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/ * \note olist_size=0 **/
@ -143,14 +144,6 @@ class BaseThree {
const bool vflag, const bool eatom, const bool vatom, const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success); int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring /// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full, int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo, const int nall, double **host_x, int *host_type, double *sublo,
@ -193,6 +186,9 @@ class BaseThree {
/// Neighbor data /// Neighbor data
Neighbor *nbor; Neighbor *nbor;
UCL_D_Vec<int> dev_short_nbor;
UCL_Kernel k_short_nbor;
// ------------------------- DEVICE KERNELS ------------------------- // ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program; UCL_Program *pair_program;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom; UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
@ -203,16 +199,17 @@ class BaseThree {
UCL_Texture pos_tex; UCL_Texture pos_tex;
protected: protected:
bool _compiled; bool _compiled,_short_nbor;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue; int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor; int _gpu_nbor;
double _max_bytes, _max_an_bytes; double _max_bytes, _max_an_bytes;
int _max_nbors, _ainum;
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data; UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k_two, const char *k_three_center, const char *two, const char *three_center,
const char *k_three_end); const char *three_end, const char* short_nbor);
virtual void loop(const bool _eflag, const bool _vflag, virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0; const int evatom) = 0;

View File

@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
int success; int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,sw,"k_sw","k_sw_three_center", _screen,sw,"k_sw","k_sw_three_center",
"k_sw_three_end"); "k_sw_three_end","k_sw_short_nbor");
if (success!=0) if (success!=0)
return success; return success;
@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ // build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum(); ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch(); nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start(); this->time_pair.start();
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch, &eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom); &this->_threads_per_atom);
@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom); &nbor_pitch, &this->_threads_per_atom, &evatom);
@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

View File

@ -130,6 +130,64 @@ texture<int4> sw3_tex;
#endif #endif
__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch, const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
__kernel void k_sw(const __global numtyp4 *restrict x_, __kernel void k_sw(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1, const __global numtyp4 *restrict sw1,
@ -140,6 +198,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
const int nelements, const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int inum, const int eflag, const int vflag, const int inum,
@ -167,9 +226,14 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
itype=map[itype]; itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_short_nbor[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -337,6 +401,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const int nelements, const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -371,9 +436,15 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
itype=map[itype]; itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
int nborj_start = nbor_j;
nbor_end = nbor_j+fast_mul(numj,n_stride);
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j]; int j=dev_short_nbor[nbor_j];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -395,14 +466,16 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x; sw_cut_ij=sw3_ijparam.x;
int nbor_k=nbor_j-offset_j+offset_k; int nbor_k=nborj_start-offset_j+offset_k;
if (nbor_k<=nbor_j) int numk = dev_short_nbor[nbor_k-n_stride];
nbor_k+=n_stride; int k_end = nbor_k+fast_mul(numk,n_stride);
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) { for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k]; int k=dev_short_nbor[nbor_k];
k &= NEIGHMASK; k &= NEIGHMASK;
if (k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex); numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w; int ktype=kx.w;
ktype=map[ktype]; ktype=map[ktype];
@ -460,6 +533,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
const __global int * dev_acc, const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -494,8 +568,13 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
itype=map[itype]; itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j]; int j=dev_short_nbor[nbor_j];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -534,8 +613,13 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k; nbor_k+=offset_k;
} }
// recalculate numk and k_end for the use of short neighbor list
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
for ( ; nbor_k<k_end; nbor_k+=n_stride) { for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k]; int k=dev_short_nbor[nbor_k];
k &= NEIGHMASK; k &= NEIGHMASK;
if (k == i) continue; if (k == i) continue;
@ -598,6 +682,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
const __global int * dev_acc, const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -632,8 +717,13 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
itype=map[itype]; itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j]; int j=dev_short_nbor[nbor_j];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -672,8 +762,13 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k; nbor_k+=offset_k;
} }
// recalculate numk and k_end for the use of short neighbor list
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
for ( ; nbor_k<k_end; nbor_k+=n_stride) { for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k]; int k=dev_short_nbor[nbor_k];
k &= NEIGHMASK; k &= NEIGHMASK;
if (k == i) continue; if (k == i) continue;

View File

@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
int success; int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,vashishta,"k_vashishta","k_vashishta_three_center", _screen,vashishta,"k_vashishta","k_vashishta_three_center",
"k_vashishta_three_end"); "k_vashishta_three_end","k_vashishta_short_nbor");
if (success!=0) if (success!=0)
return success; return success;
@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
double r0sqmax = 0;
for (int i=0; i<nparams; i++) { for (int i=0; i<nparams; i++) {
double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4? double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4?
if (r0sqmax < r0sq) r0sqmax = r0sq;
dview[i].x=static_cast<numtyp>(r0sq); dview[i].x=static_cast<numtyp>(r0sq);
dview[i].y=static_cast<numtyp>(gamma[i]); dview[i].y=static_cast<numtyp>(gamma[i]);
dview[i].z=static_cast<numtyp>(cutsq[i]); dview[i].z=static_cast<numtyp>(cutsq[i]);
dview[i].w=static_cast<numtyp>(r0[i]); dview[i].w=static_cast<numtyp>(r0[i]);
} }
_cutshortsq = static_cast<numtyp>(r0sqmax);
ucl_copy(param4,dview,false); ucl_copy(param4,dview,false);
param4_tex.get_texture(*(this->pair_program),"param4_tex"); param4_tex.get_texture(*(this->pair_program),"param4_tex");
param4_tex.bind_float(param4,4); param4_tex.bind_float(param4,4);
@ -223,15 +226,27 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else else
vflag=0; vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ // build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom))); (BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &_cutshortsq,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum(); ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch(); nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start(); this->time_pair.start();
// note that k_pair does not run with the short neighbor list
this->k_pair.set_size(GX,BX); this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5, this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements, &map, &elem2param, &_nelements,
@ -248,6 +263,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5, this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom); &nbor_pitch, &this->_threads_per_atom, &evatom);
Answer<numtyp,acctyp> *end_ans; Answer<numtyp,acctyp> *end_ans;
@ -257,21 +273,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
end_ans=this->ans; end_ans=this->ans;
#endif #endif
if (evatom!=0) { if (evatom!=0) {
this->k_three_end_vatom.set_size(GX,BX); this->k_three_end_vatom.set_size(GX,BX);
this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5, this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else { } else {
this->k_three_end.set_size(GX,BX); this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5, this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc, &this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} }

View File

@ -136,6 +136,56 @@ texture<int4> param5_tex;
#endif #endif
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
const numtyp cutshortsq,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutshortsq) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
__kernel void k_vashishta(const __global numtyp4 *restrict x_, __kernel void k_vashishta(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param1, const __global numtyp4 *restrict param1,
@ -176,7 +226,6 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
itype=map[itype]; itype=map[itype];
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor]; int j=dev_packed[nbor];
j &= NEIGHMASK; j &= NEIGHMASK;
@ -211,7 +260,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp param3_dvrc=param3_ijparam.z; numtyp param3_dvrc=param3_ijparam.z;
numtyp param3_c0 =param3_ijparam.w; numtyp param3_c0 =param3_ijparam.w;
numtyp r=sqrt(rsq); numtyp r=ucl_sqrt(rsq);
numtyp rinvsq=1.0/rsq; numtyp rinvsq=1.0/rsq;
numtyp r4inv = rinvsq*rinvsq; numtyp r4inv = rinvsq*rinvsq;
numtyp r6inv = rinvsq*r4inv; numtyp r6inv = rinvsq*r4inv;
@ -219,8 +268,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp reta = pow(r,-param1_eta); numtyp reta = pow(r,-param1_eta);
numtyp lam1r = r*param1_lam1inv; numtyp lam1r = r*param1_lam1inv;
numtyp lam4r = r*param1_lam4inv; numtyp lam4r = r*param1_lam4inv;
numtyp vc2 = param1_zizj * exp(-lam1r)/r; numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r); numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r);
numtyp force = (param2_dvrc*r numtyp force = (param2_dvrc*r
- (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv - (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv
@ -353,6 +402,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
const int nelements, const int nelements,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -387,9 +437,14 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
itype=map[itype]; itype=map[itype];
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { // recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
int nborj_start = nbor_j;
nbor_end = nbor_j+fast_mul(numj,n_stride);
int j=dev_packed[nbor_j]; for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_short_nbor[nbor_j];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -410,14 +465,16 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
param_gamma_ij=param4_ijparam.y; param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w; param_r0_ij=param4_ijparam.w;
int nbor_k=nbor_j-offset_j+offset_k; int nbor_k=nborj_start-offset_j+offset_k;
if (nbor_k<=nbor_j) int numk = dev_short_nbor[nbor_k-n_stride];
nbor_k+=n_stride; int k_end = nbor_k+fast_mul(numk,n_stride);
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) { for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k]; int k=dev_short_nbor[nbor_k];
k &= NEIGHMASK; k &= NEIGHMASK;
if (k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex); numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w; int ktype=kx.w;
ktype=map[ktype]; ktype=map[ktype];
@ -478,6 +535,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
const __global int * dev_acc, const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -512,8 +570,13 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
itype=map[itype]; itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j]; int j=dev_short_nbor[nbor_j];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -551,8 +614,13 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k; nbor_k+=offset_k;
} }
// recalculate numk and k_end for the use of short neighbor list
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
for ( ; nbor_k<k_end; nbor_k+=n_stride) { for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k]; int k=dev_short_nbor[nbor_k];
k &= NEIGHMASK; k &= NEIGHMASK;
if (k == i) continue; if (k == i) continue;
@ -617,6 +685,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor, const __global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
const __global int * dev_acc, const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
const int eflag, const int vflag, const int eflag, const int vflag,
@ -651,8 +720,13 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w; int itype=ix.w;
itype=map[itype]; itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j]; int j=dev_short_nbor[nbor_j];
j &= NEIGHMASK; j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -690,8 +764,13 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k; nbor_k+=offset_k;
} }
// recalculate numk and k_end for the use of short neighbor list
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
for ( ; nbor_k<k_end; nbor_k+=n_stride) { for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k]; int k=dev_short_nbor[nbor_k];
k &= NEIGHMASK; k &= NEIGHMASK;
if (k == i) continue; if (k == i) continue;

View File

@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> {
UCL_D_Vec<int> elem2param; UCL_D_Vec<int> elem2param;
UCL_D_Vec<int> map; UCL_D_Vec<int> map;
int _nparams,_nelements; int _nparams,_nelements;
numtyp _cutshortsq;
UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex; UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;