Switched to the short neighbor list implementation from the pre-10Feb21 version (the more recent version enforces threads-per-atom (tpa) = 1 for the short neighbor list)

This commit is contained in:
Trung Nguyen
2021-09-11 00:34:43 -05:00
parent 4ebe5833d3
commit 7f5a82dc54
5 changed files with 103 additions and 54 deletions

View File

@ -141,14 +141,31 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
// Build the short neighbor list if needed
if (!this->short_nbor_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &_off2, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->short_nbor_avail = true;
}
this->k_polar.set_size(GX,BX);
this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom,
&_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
this->time_pair.stop();
// Mark the short neighbor list as unavailable so that it is rebuilt on the next time step.
// This is done here because polar_real() is currently the last kernel launched in a time step.
this->short_nbor_avail = false;
return GX;
}
@ -163,20 +180,22 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
// Build the short neighbor list if needed
if (!this->short_nbor_avail) {
this->k_short_nbor.set_size(GX,BX);
// NOTE: this->nbor->dev_packed is not allocated!!
/*
this->k_short_nbor.run(&this->atom->x, &_off2,
&this->nbor->dev_nbor, &this->nbor->dev_packed,
&ainum, &nbor_pitch, &this->_threads_per_atom);
*/
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &_off2, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->short_nbor_avail = true;
}
this->k_udirect2b.set_size(GX,BX);
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &_aewald, &_off2,
&_polar_dscale, &_polar_uscale);

View File

@ -196,6 +196,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
const __global int *dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
__global numtyp4 *restrict tep,
@ -255,6 +256,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
int numj, nbor, nbor_end;
const __global int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -262,6 +264,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
//numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w;
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
ci = polar1[i].x; // rpole[i][0];
dix = polar1[i].y; // rpole[i][1];
diy = polar1[i].z; // rpole[i][2];
@ -289,7 +299,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
for ( ; nbor<nbor_end; nbor+=n_stride) {
int jextra=dev_packed[nbor];
int jextra=nbor_mem[nbor];
int j = jextra & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -709,6 +719,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor,
const __global int *dev_packed,
const __global int *dev_short_nbor,
__global numtyp4 *restrict fieldp,
const int inum, const int nall,
const int nbor_pitch, const int t_per_atom,
@ -733,6 +744,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
if (ii<inum) {
int numj, nbor, nbor_end;
const __global int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -740,6 +752,14 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
//numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w;
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int itype,igroup;
numtyp bn[4],bcn[3];
numtyp fid[3],fip[3];
@ -769,7 +789,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
for ( ; nbor<nbor_end; nbor+=n_stride) {
int jextra=dev_packed[nbor];
int jextra=nbor_mem[nbor];
int j = jextra & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -1093,7 +1113,6 @@ __kernel void k_special15(__global int * dev_nbor,
} // if ii
}
/*
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
const numtyp off2, __global int * dev_nbor,
@ -1149,38 +1168,36 @@ __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
}
}
*/
#ifdef LAL_SIMD_IP_SYNC
#define t_per_atom t_per_atom_in
#else
#define t_per_atom 1
#endif
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
const numtyp off2,
__global int * dev_nbor,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const numtyp off2,
const int inum, const int nbor_pitch,
const int t_per_atom_in) {
const int ii=GLOBAL_ID_X;
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
/*
const int i=dev_packed[ii];
int nbor=ii+nbor_pitch;
const int numj=dev_packed[nbor];
nbor+=nbor_pitch;
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int newj=0;
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK15;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int sj=dev_packed[nbor];
int j = sj & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
@ -1190,15 +1207,14 @@ __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<off2) {
//*out_list=sj;
out_list++;
newj++;
if ((newj & (t_per_atom-1))==0)
out_list+=out_stride;
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
//dev_nbor[ii+nbor_pitch]=newj;
*/
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}

View File

@ -21,7 +21,7 @@ namespace LAMMPS_AL {
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) {
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -101,8 +101,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
_nbor_data=&(nbor->dev_nbor);
}
bool allocate_packed = false;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,false,_threads_per_atom);
_gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom);
if (success!=0)
return success;
@ -126,6 +127,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
@ -158,6 +161,7 @@ void BaseAmoebaT::clear_atomic() {
time_pair.clear();
hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear();
ans->clear();
@ -195,7 +199,7 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
@ -206,7 +210,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
return 0;
atom->cast_copy_x(host_x,host_type);
int mn;
@ -232,6 +236,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return mn;
}
// ---------------------------------------------------------------------------
@ -385,7 +390,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, nspecial15, special15,
success);
if (!success)
@ -409,6 +414,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
return nbor->host_jlist.begin()-host_start;
}

View File

@ -123,7 +123,7 @@ class BaseAmoeba {
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
int build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint **special15,
@ -236,6 +236,8 @@ class BaseAmoeba {
int add_onefive_neighbors();
UCL_D_Vec<int> dev_short_nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
@ -251,8 +253,9 @@ class BaseAmoeba {
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
int _extra_fields;
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15;
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
double _gpu_overhead, _driver_overhead;
bool short_nbor_avail;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string,

View File

@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
gpu_udirect2b_ready = true;
gpu_umutual2b_ready = false;
gpu_polar_real_ready = false;
gpu_polar_real_ready = true;
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
}