Switched to the short neighbor list implementation in the pre-10Feb21 version (the recent version enforces tpa = 1 for short nbor)

This commit is contained in:
Trung Nguyen
2021-09-11 00:34:43 -05:00
parent 4ebe5833d3
commit 7f5a82dc54
5 changed files with 103 additions and 54 deletions

View File

@ -141,14 +141,31 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
int nbor_pitch=this->nbor->nbor_pitch(); int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start(); this->time_pair.start();
// Build the short neighbor list if needed
if (!this->short_nbor_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &_off2, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->short_nbor_avail = true;
}
this->k_polar.set_size(GX,BX); this->k_polar.set_size(GX,BX);
this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &this->_tep, &this->ans->force, &this->ans->engv, &this->_tep,
&eflag, &vflag, &ainum, &_nall, &nbor_pitch, &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &this->_threads_per_atom,
&_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
this->time_pair.stop(); this->time_pair.stop();
// Signal that short nbor list is not avail for the next time step
// do it here because polar_real() is the last kernel in a time step at this point
this->short_nbor_avail = false;
return GX; return GX;
} }
@ -163,20 +180,22 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy // Compute the block size and grid size to keep all cores busy
const int BX=this->block_size(); const int BX=this->block_size();
int GX; int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
// Build the short neighbor list if needed
if (!this->short_nbor_avail) {
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->dev_short_nbor, &_off2, &ainum,
&nbor_pitch, &this->_threads_per_atom);
this->short_nbor_avail = true;
}
GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
this->k_short_nbor.set_size(GX,BX);
// NOTE: this->nbor->dev_packed is not allocated!!
/*
this->k_short_nbor.run(&this->atom->x, &_off2,
&this->nbor->dev_nbor, &this->nbor->dev_packed,
&ainum, &nbor_pitch, &this->_threads_per_atom);
*/
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.set_size(GX,BX);
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_fieldp, &ainum, &_nall, &nbor_pitch,
&this->_threads_per_atom, &_aewald, &_off2, &this->_threads_per_atom, &_aewald, &_off2,
&_polar_dscale, &_polar_uscale); &_polar_dscale, &_polar_uscale);

View File

@ -196,6 +196,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sp_polar, const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor,
__global acctyp4 *restrict ans, __global acctyp4 *restrict ans,
__global acctyp *restrict engv, __global acctyp *restrict engv,
__global numtyp4 *restrict tep, __global numtyp4 *restrict tep,
@ -255,6 +256,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
int numj, nbor, nbor_end; int numj, nbor, nbor_end;
const __global int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -262,6 +264,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
//numtyp qtmp; fetch(qtmp,i,q_tex); //numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w; //int itype=ix.w;
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
ci = polar1[i].x; // rpole[i][0]; ci = polar1[i].x; // rpole[i][0];
dix = polar1[i].y; // rpole[i][1]; dix = polar1[i].y; // rpole[i][1];
diy = polar1[i].z; // rpole[i][2]; diy = polar1[i].z; // rpole[i][2];
@ -289,7 +299,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int jextra=dev_packed[nbor]; int jextra=nbor_mem[nbor];
int j = jextra & NEIGHMASK15; int j = jextra & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -709,6 +719,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sp_polar, const __global numtyp4 *restrict sp_polar,
const __global int *dev_nbor, const __global int *dev_nbor,
const __global int *dev_packed, const __global int *dev_packed,
const __global int *dev_short_nbor,
__global numtyp4 *restrict fieldp, __global numtyp4 *restrict fieldp,
const int inum, const int nall, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom, const int nbor_pitch, const int t_per_atom,
@ -733,6 +744,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
if (ii<inum) { if (ii<inum) {
int numj, nbor, nbor_end; int numj, nbor, nbor_end;
const __global int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor); n_stride,nbor_end,nbor);
@ -740,6 +752,14 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
//numtyp qtmp; fetch(qtmp,i,q_tex); //numtyp qtmp; fetch(qtmp,i,q_tex);
//int itype=ix.w; //int itype=ix.w;
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int itype,igroup; int itype,igroup;
numtyp bn[4],bcn[3]; numtyp bn[4],bcn[3];
numtyp fid[3],fip[3]; numtyp fid[3],fip[3];
@ -769,7 +789,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
for ( ; nbor<nbor_end; nbor+=n_stride) { for ( ; nbor<nbor_end; nbor+=n_stride) {
int jextra=dev_packed[nbor]; int jextra=nbor_mem[nbor];
int j = jextra & NEIGHMASK15; int j = jextra & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -1093,7 +1113,6 @@ __kernel void k_special15(__global int * dev_nbor,
} // if ii } // if ii
} }
/* /*
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_, __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
const numtyp off2, __global int * dev_nbor, const numtyp off2, __global int * dev_nbor,
@ -1149,38 +1168,36 @@ __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
} }
} }
*/ */
#ifdef LAL_SIMD_IP_SYNC
#define t_per_atom t_per_atom_in
#else
#define t_per_atom 1
#endif
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_, __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
const numtyp off2, const __global int * dev_nbor,
__global int * dev_nbor,
const __global int * dev_packed, const __global int * dev_packed,
__global int * dev_short_nbor,
const numtyp off2,
const int inum, const int nbor_pitch, const int inum, const int nbor_pitch,
const int t_per_atom_in) { const int t_per_atom) {
const int ii=GLOBAL_ID_X; __local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) { if (ii<inum) {
/* int nbor, nbor_end;
const int i=dev_packed[ii]; int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
int nbor=ii+nbor_pitch; n_stride,nbor_end,nbor);
const int numj=dev_packed[nbor];
nbor+=nbor_pitch;
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int newj=0;
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom; int ncount = 0;
const int out_stride=nbor_pitch*t_per_atom-t_per_atom; int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK15;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int sj=dev_packed[nbor];
int j = sj & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12 // Compute r12
@ -1190,15 +1207,14 @@ __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
numtyp rsq = delx*delx+dely*dely+delz*delz; numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<off2) { if (rsq<off2) {
//*out_list=sj; dev_short_nbor[nbor_short] = nj;
out_list++; nbor_short += n_stride;
newj++; ncount++;
if ((newj & (t_per_atom-1))==0)
out_list+=out_stride;
} }
} // for nbor } // for nbor
//dev_nbor[ii+nbor_pitch]=newj;
*/ // store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii } // if ii
} }

View File

@ -21,7 +21,7 @@ namespace LAMMPS_AL {
extern Device<PRECISION,ACC_PRECISION> global_device; extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) { BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) {
device=&global_device; device=&global_device;
ans=new Answer<numtyp,acctyp>(); ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor(); nbor=new Neighbor();
@ -100,9 +100,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
} else { } else {
_nbor_data=&(nbor->dev_nbor); _nbor_data=&(nbor->dev_nbor);
} }
bool allocate_packed = false;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,false,_threads_per_atom); _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom);
if (success!=0) if (success!=0)
return success; return success;
@ -126,6 +127,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
if (ef_nall==0) if (ef_nall==0)
ef_nall=2000; ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10); _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
@ -158,6 +161,7 @@ void BaseAmoebaT::clear_atomic() {
time_pair.clear(); time_pair.clear();
hd_balancer.clear(); hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear(); nbor->clear();
ans->clear(); ans->clear();
@ -195,7 +199,7 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
// Build neighbor list on device // Build neighbor list on device
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, const int nall, double **host_x,
int *host_type, double *sublo, int *host_type, double *sublo,
double *subhi, tagint *tag, double *subhi, tagint *tag,
@ -206,7 +210,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
resize_atom(inum,nall,success); resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success); resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success) if (!success)
return; return 0;
atom->cast_copy_x(host_x,host_type); atom->cast_copy_x(host_x,host_type);
int mn; int mn;
@ -232,6 +236,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes) if (bytes>_max_an_bytes)
_max_an_bytes=bytes; _max_an_bytes=bytes;
return mn;
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
@ -385,7 +390,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
// Build neighbor list on GPU if necessary // Build neighbor list on GPU if necessary
if (ago==0) { if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, nspecial15, special15, sublo, subhi, tag, nspecial, special, nspecial15, special15,
success); success);
if (!success) if (!success)
@ -409,6 +414,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd); boxlo, prd);
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
return nbor->host_jlist.begin()-host_start; return nbor->host_jlist.begin()-host_start;
} }

View File

@ -123,7 +123,7 @@ class BaseAmoeba {
int **firstneigh, bool &success); int **firstneigh, bool &success);
/// Build neighbor list on device /// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum, int build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type, const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial, double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, int *nspecial15, tagint **special15, tagint **special, int *nspecial15, tagint **special15,
@ -236,6 +236,8 @@ class BaseAmoeba {
int add_onefive_neighbors(); int add_onefive_neighbors();
UCL_D_Vec<int> dev_short_nbor;
// ------------------------- DEVICE KERNELS ------------------------- // ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program; UCL_Program *pair_program;
UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
@ -251,8 +253,9 @@ class BaseAmoeba {
bool _compiled; bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom; int _block_size, _block_bio_size, _threads_per_atom;
int _extra_fields; int _extra_fields;
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
double _gpu_overhead, _driver_overhead; double _gpu_overhead, _driver_overhead;
bool short_nbor_avail;
UCL_D_Vec<int> *_nbor_data; UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string, void compile_kernels(UCL_Device &dev, const void *pair_string,

View File

@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
gpu_udirect2b_ready = true; gpu_udirect2b_ready = true;
gpu_umutual2b_ready = false; gpu_umutual2b_ready = false;
gpu_polar_real_ready = false; gpu_polar_real_ready = true;
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
} }