Switched to the short neighbor list implementation in the pre-10Feb21 version (the recent version enforces tpa = 1 for short nbor)
This commit is contained in:
@ -141,14 +141,31 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
|
||||
// Build the short neighbor list if needed
|
||||
if (!this->short_nbor_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &_off2, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_avail = true;
|
||||
}
|
||||
|
||||
this->k_polar.set_size(GX,BX);
|
||||
this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->ans->force, &this->ans->engv, &this->_tep,
|
||||
&eflag, &vflag, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom,
|
||||
&_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
|
||||
this->time_pair.stop();
|
||||
|
||||
// Signal that short nbor list is not avail for the next time step
|
||||
// do it here because polar_real() is the last kernel in a time step at this point
|
||||
|
||||
this->short_nbor_avail = false;
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
@ -163,20 +180,22 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX;
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
|
||||
|
||||
GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
// Build the short neighbor list if needed
|
||||
if (!this->short_nbor_avail) {
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
// NOTE: this->nbor->dev_packed is not allocated!!
|
||||
/*
|
||||
this->k_short_nbor.run(&this->atom->x, &_off2,
|
||||
&this->nbor->dev_nbor, &this->nbor->dev_packed,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
*/
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
|
||||
this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
|
||||
&this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &_off2, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
this->short_nbor_avail = true;
|
||||
}
|
||||
|
||||
this->k_udirect2b.set_size(GX,BX);
|
||||
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor,
|
||||
&this->_fieldp, &ainum, &_nall, &nbor_pitch,
|
||||
&this->_threads_per_atom, &_aewald, &_off2,
|
||||
&_polar_dscale, &_polar_uscale);
|
||||
|
||||
@ -196,6 +196,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict sp_polar,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global numtyp4 *restrict tep,
|
||||
@ -255,6 +256,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
|
||||
|
||||
int numj, nbor, nbor_end;
|
||||
const __global int* nbor_mem=dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -262,6 +264,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
//numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
//int itype=ix.w;
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
ci = polar1[i].x; // rpole[i][0];
|
||||
dix = polar1[i].y; // rpole[i][1];
|
||||
diy = polar1[i].z; // rpole[i][2];
|
||||
@ -289,7 +299,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int jextra=dev_packed[nbor];
|
||||
int jextra=nbor_mem[nbor];
|
||||
int j = jextra & NEIGHMASK15;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -709,6 +719,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict sp_polar,
|
||||
const __global int *dev_nbor,
|
||||
const __global int *dev_packed,
|
||||
const __global int *dev_short_nbor,
|
||||
__global numtyp4 *restrict fieldp,
|
||||
const int inum, const int nall,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
@ -733,6 +744,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (ii<inum) {
|
||||
int numj, nbor, nbor_end;
|
||||
const __global int* nbor_mem=dev_packed;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -740,6 +752,14 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
//numtyp qtmp; fetch(qtmp,i,q_tex);
|
||||
//int itype=ix.w;
|
||||
|
||||
// recalculate numj and nbor_end for use of the short nbor list
|
||||
if (dev_packed==dev_nbor) {
|
||||
numj = dev_short_nbor[nbor];
|
||||
nbor += n_stride;
|
||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||
nbor_mem = dev_short_nbor;
|
||||
}
|
||||
|
||||
int itype,igroup;
|
||||
numtyp bn[4],bcn[3];
|
||||
numtyp fid[3],fip[3];
|
||||
@ -769,7 +789,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int jextra=dev_packed[nbor];
|
||||
int jextra=nbor_mem[nbor];
|
||||
int j = jextra & NEIGHMASK15;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
@ -1093,7 +1113,6 @@ __kernel void k_special15(__global int * dev_nbor,
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
/*
|
||||
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const numtyp off2, __global int * dev_nbor,
|
||||
@ -1149,38 +1168,36 @@ __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
}
|
||||
*/
|
||||
#ifdef LAL_SIMD_IP_SYNC
|
||||
#define t_per_atom t_per_atom_in
|
||||
#else
|
||||
#define t_per_atom 1
|
||||
#endif
|
||||
|
||||
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const numtyp off2,
|
||||
__global int * dev_nbor,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const numtyp off2,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom_in) {
|
||||
const int ii=GLOBAL_ID_X;
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
if (ii<inum) {
|
||||
/*
|
||||
const int i=dev_packed[ii];
|
||||
|
||||
int nbor=ii+nbor_pitch;
|
||||
const int numj=dev_packed[nbor];
|
||||
nbor+=nbor_pitch;
|
||||
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int newj=0;
|
||||
|
||||
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
|
||||
const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
dev_short_nbor[m] = 0;
|
||||
int nbor_short = nbor+n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int j=dev_packed[nbor];
|
||||
int nj = j;
|
||||
j &= NEIGHMASK15;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
||||
int sj=dev_packed[nbor];
|
||||
int j = sj & NEIGHMASK15;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
// Compute r12
|
||||
@ -1190,15 +1207,14 @@ __kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<off2) {
|
||||
//*out_list=sj;
|
||||
out_list++;
|
||||
newj++;
|
||||
if ((newj & (t_per_atom-1))==0)
|
||||
out_list+=out_stride;
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
//dev_nbor[ii+nbor_pitch]=newj;
|
||||
*/
|
||||
|
||||
// store the number of neighbors for each thread
|
||||
dev_short_nbor[m] = ncount;
|
||||
|
||||
} // if ii
|
||||
}
|
||||
@ -21,7 +21,7 @@ namespace LAMMPS_AL {
|
||||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) {
|
||||
BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
@ -101,8 +101,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
}
|
||||
|
||||
bool allocate_packed = false;
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
|
||||
_gpu_host,max_nbors,cell_size,false,_threads_per_atom);
|
||||
_gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -126,6 +127,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
|
||||
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
|
||||
|
||||
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
|
||||
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
@ -158,6 +161,7 @@ void BaseAmoebaT::clear_atomic() {
|
||||
time_pair.clear();
|
||||
hd_balancer.clear();
|
||||
|
||||
dev_short_nbor.clear();
|
||||
nbor->clear();
|
||||
ans->clear();
|
||||
|
||||
@ -195,7 +199,7 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
|
||||
// Build neighbor list on device
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
|
||||
inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
@ -206,7 +210,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(inum,host_inum,nbor->max_nbors(),success);
|
||||
if (!success)
|
||||
return;
|
||||
return 0;
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
int mn;
|
||||
@ -232,6 +236,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
return mn;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -385,7 +390,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
|
||||
|
||||
// Build neighbor list on GPU if necessary
|
||||
if (ago==0) {
|
||||
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
|
||||
sublo, subhi, tag, nspecial, special, nspecial15, special15,
|
||||
success);
|
||||
if (!success)
|
||||
@ -409,6 +414,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
|
||||
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
|
||||
boxlo, prd);
|
||||
|
||||
// re-allocate dev_short_nbor if necessary
|
||||
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
|
||||
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
dev_short_nbor.resize((2+_max_nbors)*_nmax);
|
||||
}
|
||||
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
|
||||
@ -123,7 +123,7 @@ class BaseAmoeba {
|
||||
int **firstneigh, bool &success);
|
||||
|
||||
/// Build neighbor list on device
|
||||
void build_nbor_list(const int inum, const int host_inum,
|
||||
int build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, int *nspecial15, tagint **special15,
|
||||
@ -236,6 +236,8 @@ class BaseAmoeba {
|
||||
|
||||
int add_onefive_neighbors();
|
||||
|
||||
UCL_D_Vec<int> dev_short_nbor;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
|
||||
@ -251,8 +253,9 @@ class BaseAmoeba {
|
||||
bool _compiled;
|
||||
int _block_size, _block_bio_size, _threads_per_atom;
|
||||
int _extra_fields;
|
||||
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15;
|
||||
double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
bool short_nbor_avail;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
|
||||
@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
|
||||
|
||||
gpu_udirect2b_ready = true;
|
||||
gpu_umutual2b_ready = false;
|
||||
gpu_polar_real_ready = false;
|
||||
gpu_polar_real_ready = true;
|
||||
|
||||
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user