Working on the short nbor list for the amoeba kernels (based on what has been done with tersoff and ellipsoid; nbor dev_packed needs to be allocated properly)

This commit is contained in:
Trung Nguyen
2021-09-10 16:51:16 -05:00
parent a22923aee2
commit 4ebe5833d3
5 changed files with 151 additions and 24 deletions

View File

@ -59,7 +59,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
cell_size,gpu_split,_screen,amoeba,
"k_amoeba_polar", "k_amoeba_udirect2b",
"k_amoeba_umutual2b");
"k_amoeba_umutual2b", "k_amoeba_short_nbor");
if (success!=0)
return success;
@ -157,16 +157,23 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int AmoebaT::udirect2b(const int eflag, const int vflag) {
int _nall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
int ainum=this->ans->inum();
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int _nall=this->atom->nall();
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
int GX;
GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
this->k_short_nbor.set_size(GX,BX);
// NOTE: this->nbor->dev_packed is not allocated!!
/*
this->k_short_nbor.run(&this->atom->x, &_off2,
&this->nbor->dev_nbor, &this->nbor->dev_packed,
&ainum, &nbor_pitch, &this->_threads_per_atom);
*/
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
this->k_udirect2b.set_size(GX,BX);
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),

View File

@ -781,8 +781,10 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
numtyp zr = jx.z - ix.z;
numtyp r2 = xr*xr + yr*yr + zr*zr;
if (r2>off2) continue;
if (r2>off2) {
if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj);
continue;
}
numtyp r = ucl_sqrt(r2);
numtyp rinv = ucl_recip(r);
numtyp r2inv = rinv*rinv;
@ -1091,3 +1093,112 @@ __kernel void k_special15(__global int * dev_nbor,
} // if ii
}
/*
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
const numtyp off2, __global int * dev_nbor,
const __global int * dev_packed,
const int inum, const int nbor_pitch,
const int t_per_atom) {
int tid, ii, offset, n_stride, i;
atom_info(t_per_atom,ii,tid,offset);
int new_numj=0;
if (ii<inum) {
int numj, nbor, nbor_end;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
if (i == 0) printf("i = 0: numj before = %d\n", numj);
__global int *out_list=dev_nbor+nbor;
const int out_stride=n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int sj=dev_packed[nbor];
int j = sj & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<=off2) {
*out_list=sj;
out_list+=out_stride;
new_numj++;
if (i == 0 && offset == 0) printf("neighbor of i = 0 within off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
} else {
if (i == 0 && offset == 0) printf("neighbor of i = 0 outside off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
}
} // for nbor
} // if ii
if (t_per_atom>1) {
for (unsigned int s=t_per_atom/2; s>0; s>>=1)
new_numj += shfl_down(new_numj, s, t_per_atom);
}
if (offset==0 && ii<inum) {
dev_nbor[ii+nbor_pitch]=new_numj;
if (i == 0) printf("i = 0: numj after = %d\n", new_numj);
}
}
*/
// Select what the name t_per_atom means inside the kernel below:
// with LAL_SIMD_IP_SYNC defined it aliases the kernel argument t_per_atom_in,
// otherwise it is the compile-time constant 1.
// NOTE(review): no matching #undef is visible in this chunk, so the macro
// appears to stay defined for any code that follows -- confirm this is intended.
#ifdef LAL_SIMD_IP_SYNC
#define t_per_atom t_per_atom_in
#else
#define t_per_atom 1
#endif
// k_amoeba_short_nbor: work-in-progress kernel intended to build a short
// neighbor list containing only neighbors within the squared cutoff off2.
//
// Current state: the active code only reads the work-item index and applies
// the ii<inum bounds guard; the whole filtering body is inside the block
// comment below, so the kernel currently has no effect on dev_nbor.
//
// Parameters (as used by the disabled body):
//   x_            - atom positions; the body fetches them through
//                   fetch4(..., pos_tex) rather than indexing x_ directly
//   off2          - squared cutoff distance compared against rsq
//   dev_nbor      - destination neighbor array (out_list points into it)
//   dev_packed    - source neighbor array: dev_packed[ii] is read as the atom
//                   index i and dev_packed[ii+nbor_pitch] as its neighbor count
//   inum          - number of entries to process; indices >= inum do nothing
//   nbor_pitch    - stride between successive neighbor rows in both arrays
//   t_per_atom_in - threads per atom; only referenced through the t_per_atom
//                   macro, and only inside the disabled body
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
const numtyp off2,
__global int * dev_nbor,
const __global int * dev_packed,
const int inum, const int nbor_pitch,
const int t_per_atom_in) {
// presumably the global work-item index along X -- GLOBAL_ID_X is defined
// outside this chunk; verify against the preprocessor header
const int ii=GLOBAL_ID_X;
if (ii<inum) {
// Disabled draft of the filtering loop. Even if this block comment were
// removed, the two stores (to *out_list and to dev_nbor[ii+nbor_pitch]) are
// themselves commented out with //, so the loop would only count neighbors
// with rsq<off2 in newj without writing anything back.
/*
const int i=dev_packed[ii];
int nbor=ii+nbor_pitch;
const int numj=dev_packed[nbor];
nbor+=nbor_pitch;
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int newj=0;
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int sj=dev_packed[nbor];
int j = sj & NEIGHMASK15;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<off2) {
//*out_list=sj;
out_list++;
newj++;
if ((newj & (t_per_atom-1))==0)
out_list+=out_stride;
}
} // for nbor
//dev_nbor[ii+nbor_pitch]=newj;
*/
} // if ii
}

View File

@ -40,6 +40,7 @@ BaseAmoebaT::~BaseAmoeba() {
k_udirect2b.clear();
k_umutual2b.clear();
k_special15.clear();
k_short_nbor.clear();
if (pair_program) delete pair_program;
}
@ -57,7 +58,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
FILE *_screen, const void *pair_program,
const char *k_name_polar,
const char *k_name_udirect2b,
const char *k_name_umutual2b) {
const char *k_name_umutual2b,
const char *k_name_short_nbor) {
screen=_screen;
int gpu_nbor=0;
@ -89,16 +91,18 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b);
compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,
k_name_umutual2b,k_name_short_nbor);
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
} else {
_nbor_data=&(nbor->dev_nbor);
}
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
@ -223,6 +227,8 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
add_onefive_neighbors();
}
//nbor->copy_unpacked(inum,mn);
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -692,7 +698,8 @@ template <class numtyp, class acctyp>
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname_polar,
const char *kname_udirect2b,
const char *kname_umutual2b) {
const char *kname_umutual2b,
const char *kname_short_nbor) {
if (_compiled)
return;
@ -704,6 +711,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
k_polar.set_function(*pair_program,kname_polar);
k_udirect2b.set_function(*pair_program,kname_udirect2b);
k_umutual2b.set_function(*pair_program,kname_umutual2b);
k_short_nbor.set_function(*pair_program,kname_short_nbor);
k_special15.set_function(*pair_program,"k_special15");
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");

View File

@ -55,7 +55,7 @@ class BaseAmoeba {
const int maxspecial, const int maxspecial15, const double cell_size,
const double gpu_split, FILE *screen, const void *pair_program,
const char *kname_polar, const char *kname_udirect2b,
const char *kname_umutual2b);
const char *kname_umutual2b, const char *kname_short_nbor);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead(const int add_kernels=0);
@ -239,6 +239,7 @@ class BaseAmoeba {
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
UCL_Kernel k_short_nbor;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {}
@ -256,7 +257,7 @@ class BaseAmoeba {
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *kname_polar, const char *kname_udirect2b,
const char *kname_umutual2b);
const char *kname_umutual2b, const char *kname_short_nbor);
virtual int udirect2b(const int eflag, const int vflag) = 0;
virtual int umutual2b(const int eflag, const int vflag) = 0;

View File

@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
gpu_udirect2b_ready = true;
gpu_umutual2b_ready = false;
gpu_polar_real_ready = true;
gpu_polar_real_ready = false;
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
}