Merge pull request #597 from ndtrung81/three-body-short-nlist

Implementing short neighbor lists for three-body gpu styles
This commit is contained in:
Steve Plimpton
2017-08-17 11:31:59 -06:00
committed by GitHub
18 changed files with 1181 additions and 902 deletions

View File

@ -22,21 +22,21 @@
offset=tid & (t_per_atom-1); \
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, nbor_end, nbor_begin) \
i=nbor_mem[ii]; \
nbor_begin=ii+nbor_stride; \
numj=nbor_mem[nbor_begin]; \
if (nbor_mem==packed_mem) { \
nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
stride=fast_mul(t_per_atom,nbor_stride); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
i, numj, n_stride, nbor_end, nbor_begin) \
i=dev_nbor[ii]; \
nbor_begin=ii+nbor_pitch; \
numj=dev_nbor[nbor_begin]; \
if (dev_nbor==dev_packed) { \
nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \
n_stride=fast_mul(t_per_atom,nbor_pitch); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
nbor_begin+=offset; \
} else { \
nbor_begin+=nbor_stride; \
nbor_begin=nbor_mem[nbor_begin]; \
nbor_begin+=nbor_pitch; \
nbor_begin=dev_nbor[nbor_begin]; \
nbor_end=nbor_begin+numj; \
stride=t_per_atom; \
n_stride=t_per_atom; \
nbor_begin+=offset; \
}

View File

@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_two, const char *k_three_center,
const char *k_three_end) {
const char *two, const char *three_center,
const char *three_end, const char *short_nbor) {
screen=_screen;
int gpu_nbor=0;
@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) {
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
} else // neigh yes or tpa == 1
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_block_pair=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_max_an_bytes+=ans2->gpu_bytes();
#endif
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
return 0;
}
@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() {
k_three_end.clear();
k_three_end_vatom.clear();
k_pair.clear();
k_short_nbor.clear();
delete pair_program;
_compiled=false;
}
@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() {
time_pair.clear();
hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear();
ans->clear();
#ifdef THREE_CONCURRENT
@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
if (!success)
return NULL;
_nall = nall;
// originally the requirement that nall == nlist was enforced
// to allow direct indexing neighbors of neighbors after re-arrangement
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
return 0;
atom->cast_copy_x(host_x,host_type);
_nall = nall;
int mn;
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nlist;
int evatom=0;
if (eatom || vatom)
evatom=1;
@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nall;
int evatom=0;
if (eatom || vatom)
evatom=1;
@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *ktwo, const char *kthree_center,
const char *kthree_end) {
const char *two, const char *three_center,
const char *three_end, const char* short_nbor) {
if (_compiled)
return;
std::string vatom_name=std::string(kthree_end)+"_vatom";
std::string vatom_name=std::string(three_end)+"_vatom";
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
k_three_center.set_function(*pair_program,kthree_center);
k_three_end.set_function(*pair_program,kthree_end);
k_three_center.set_function(*pair_program,three_center);
k_three_end.set_function(*pair_program,three_end);
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
k_pair.set_function(*pair_program,ktwo);
k_pair.set_function(*pair_program,two);
k_short_nbor.set_function(*pair_program,short_nbor);
pos_tex.get_texture(*pair_program,"pos_tex");
#ifdef THREE_CONCURRENT

View File

@ -56,7 +56,8 @@ class BaseThree {
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end);
const char *k_three_center, const char *k_three_end,
const char *k_short_nbor=NULL);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -73,18 +74,18 @@ class BaseThree {
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
/** \param inum number of particles whose nbors must be stored on device
* \param max_nbors maximum number of neighbors
* \param success set to false if insufficient memory
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
/** \param inum number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
@ -143,14 +144,6 @@ class BaseThree {
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
@ -193,6 +186,9 @@ class BaseThree {
/// Neighbor data
Neighbor *nbor;
UCL_D_Vec<int> dev_short_nbor;
UCL_Kernel k_short_nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
@ -207,12 +203,13 @@ class BaseThree {
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
double _max_bytes, _max_an_bytes;
int _max_nbors, _ainum, _nall;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k_two, const char *k_three_center,
const char *k_three_end);
const char *two, const char *three_center,
const char *three_end, const char* short_nbor);
virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0;

View File

@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,sw,"k_sw","k_sw_three_center",
"k_sw_three_end");
"k_sw_three_end","k_sw_short_nbor");
if (success!=0)
return success;
@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

View File

@ -130,6 +130,63 @@ texture<int4> sw3_tex;
#endif
__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch, const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
__kernel void k_sw(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem = dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
int nbor_k,k_end;
if (dev_packed==dev_nbor) {
nbor_k=nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
} else {
nbor_k = nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) nbor_k += n_stride;
k_end = nbor_end;
}
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (dev_packed==dev_nbor && k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;

View File

@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff,"k_tersoff_repulsive",
"k_tersoff_three_center", "k_tersoff_three_end");
"k_tersoff_three_center", "k_tersoff_three_end",
"k_tersoff_short_nbor");
if (success!=0)
return success;
@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -219,171 +225,6 @@ double TersoffT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -402,9 +243,40 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -106,7 +106,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
#endif
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -184,6 +243,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -211,22 +271,29 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -241,14 +308,20 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -365,9 +441,17 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -382,32 +466,31 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -489,7 +581,6 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -520,9 +613,15 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -598,6 +697,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -632,7 +732,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -643,9 +743,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -660,8 +769,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji: find i in the j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -711,9 +825,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -736,7 +852,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -777,9 +893,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -824,6 +942,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -858,7 +977,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -869,9 +988,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -886,8 +1014,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -937,9 +1070,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -962,7 +1097,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1010,9 +1145,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
@ -1040,7 +1177,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
}
} // for nbor

View File

@ -47,21 +47,6 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
const double* h, const double* gamma, const double* beta,
const double* powern, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -104,8 +89,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
int _max_nbors;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff_mod,"k_tersoff_mod_repulsive",
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end");
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end",
"k_tersoff_mod_short_nbor");
if (success!=0)
return success;
@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -219,171 +225,6 @@ double TersoffMT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffMT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -402,9 +243,40 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -106,7 +106,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
#endif
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -184,6 +243,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -211,22 +271,29 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -241,14 +308,18 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (numtyp)0;
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -366,9 +440,17 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -383,32 +465,31 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -493,7 +583,6 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -606,6 +703,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -642,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -653,9 +751,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -670,8 +777,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji: find i in the j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -721,9 +833,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -746,7 +860,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -790,9 +904,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -841,6 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -877,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -888,9 +1005,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -905,8 +1031,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -956,9 +1087,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -981,7 +1114,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1032,9 +1165,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;

View File

@ -47,21 +47,6 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
const double* h, const double* beta, const double* powern,
const double* powern_del, const double* ca1, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -104,8 +89,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
int _max_nbors;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff_zbl,"k_tersoff_zbl_repulsive",
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end");
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end",
"k_tersoff_zbl_short_nbor");
if (success!=0)
return success;
@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -244,171 +250,6 @@ double TersoffZT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffZT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -427,9 +268,40 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
&_global_e, &_global_a_0, &_global_epsilon_0, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -109,7 +109,7 @@ texture<int4> ts6_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -158,7 +158,7 @@ texture<int4> ts6_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -167,6 +167,65 @@ texture<int4> ts6_tex;
#endif
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -188,6 +247,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -217,22 +277,29 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -247,14 +314,18 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -342,6 +415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -379,9 +453,17 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -396,38 +478,37 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
numtyp4 ts6_ijparam = ts6[ijparam];
numtyp ijparam_Z_i = ts6_ijparam.x;
numtyp ijparam_Z_j = ts6_ijparam.y;
numtyp ijparam_ZBLcut = ts6_ijparam.z;
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
numtyp4 ts6_ijparam = ts6[ijparam];
numtyp ijparam_Z_i = ts6_ijparam.x;
numtyp ijparam_Z_j = ts6_ijparam.y;
numtyp ijparam_ZBLcut = ts6_ijparam.z;
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -509,7 +599,6 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -618,6 +715,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -652,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -663,9 +761,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -680,8 +787,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji: find i in the j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -731,9 +843,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -756,7 +870,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -797,9 +911,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -844,6 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -878,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -889,9 +1006,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -906,8 +1032,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -957,9 +1088,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -982,7 +1115,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1030,9 +1163,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;

View File

@ -49,21 +49,6 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
const double* ZBLcut, const double* ZBLexpscale, const double global_e,
const double global_a_0, const double global_epsilon_0, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex;
int _max_nbors;
numtyp _global_e,_global_a_0,_global_epsilon_0;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
"k_vashishta_three_end");
"k_vashishta_three_end","k_vashishta_short_nbor");
if (success!=0)
return success;
@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
double r0sqmax = 0;
for (int i=0; i<nparams; i++) {
double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4?
double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4?
if (r0sqmax < r0sq) r0sqmax = r0sq;
dview[i].x=static_cast<numtyp>(r0sq);
dview[i].y=static_cast<numtyp>(gamma[i]);
dview[i].z=static_cast<numtyp>(cutsq[i]);
dview[i].w=static_cast<numtyp>(r0[i]);
}
_cutshortsq = static_cast<numtyp>(r0sqmax);
ucl_copy(param4,dview,false);
param4_tex.get_texture(*(this->pair_program),"param4_tex");
param4_tex.bind_float(param4,4);
@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &param4, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
// note that k_pair does not run with the short neighbor list
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
Answer<numtyp,acctyp> *end_ans;
@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
end_ans=this->ans;
#endif
if (evatom!=0) {
this->k_three_end_vatom.set_size(GX,BX);
this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -136,6 +136,64 @@ texture<int4> param5_tex;
#endif
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param4,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
__kernel void k_vashishta(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param1,
@ -166,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -211,7 +268,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp param3_dvrc=param3_ijparam.z;
numtyp param3_c0 =param3_ijparam.w;
numtyp r=sqrt(rsq);
numtyp r=ucl_sqrt(rsq);
numtyp rinvsq=1.0/rsq;
numtyp r4inv = rinvsq*rinvsq;
numtyp r6inv = rinvsq*r4inv;
@ -219,8 +276,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp reta = pow(r,-param1_eta);
numtyp lam1r = r*param1_lam1inv;
numtyp lam4r = r*param1_lam4inv;
numtyp vc2 = param1_zizj * exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r);
numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r);
numtyp force = (param2_dvrc*r
- (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv
@ -230,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rinvsq2 = ucl_recip(rsq2); \
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinvsq = pcsinv*pcsinv; \
numtyp pcs = delcssq/pcsinv; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp frad2 = facrad*gsrainvsq2; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinvsq = pcsinv*pcsinv; \
numtyp pcs = delcssq/pcsinv; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -387,9 +446,18 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -406,18 +474,27 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij=param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
int nbor_k,k_end;
if (dev_packed==dev_nbor) {
nbor_k=nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
} else {
nbor_k = nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) nbor_k += n_stride;
k_end = nbor_end;
}
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (dev_packed==dev_nbor && k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
@ -478,6 +555,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -502,7 +580,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -512,8 +590,16 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -529,7 +615,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij = param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij = param4_ijparam.w;
@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -617,6 +710,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -641,7 +735,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -651,8 +745,16 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -668,7 +770,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij=param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w;
@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;

View File

@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> {
UCL_D_Vec<int> elem2param;
UCL_D_Vec<int> map;
int _nparams,_nelements;
numtyp _cutshortsq;
UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;

View File

@ -71,6 +71,22 @@ static const char cite_gpu_package[] =
" year = 2013,\n"
" volume = 184,\n"
" pages = {2785--2793}\n"
"}\n\n"
"@Article{Trung15,\n"
" author = {T. D. Nguyen, S. J. Plimpton},\n"
" title = {Accelerating dissipative particle dynamics simulations for soft matter systems},\n"
" journal = {Comput.~Mater.~Sci.},\n"
" year = 2015,\n"
" volume = 100,\n"
" pages = {173--180}\n"
"}\n\n"
"@Article{Trung17,\n"
" author = {T. D. Nguyen},\n"
" title = {GPU-accelerated Tersoff potentials for massively parallel Molecular Dynamics simulations},\n"
" journal = {Comp.~Phys.~Comm.},\n"
" year = 2017,\n"
" volume = 212,\n"
" pages = {113--122}\n"
"}\n\n";
/* ---------------------------------------------------------------------- */