Working on the short neighbor list for the amoeba kernels (based on what has been done for tersoff and ellipsoid; nbor->dev_packed still needs to be allocated properly)
This commit is contained in:
@ -59,7 +59,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||
cell_size,gpu_split,_screen,amoeba,
|
||||
"k_amoeba_polar", "k_amoeba_udirect2b",
|
||||
"k_amoeba_umutual2b");
|
||||
"k_amoeba_umutual2b", "k_amoeba_short_nbor");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -157,16 +157,23 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
int AmoebaT::udirect2b(const int eflag, const int vflag) {
|
||||
int _nall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int ainum=this->ans->inum();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
int _nall=this->atom->nall();
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
int GX;
|
||||
|
||||
GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
// NOTE: this->nbor->dev_packed is not allocated!!
|
||||
/*
|
||||
this->k_short_nbor.run(&this->atom->x, &_off2,
|
||||
&this->nbor->dev_nbor, &this->nbor->dev_packed,
|
||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
*/
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
|
||||
this->k_udirect2b.set_size(GX,BX);
|
||||
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
|
||||
@ -781,8 +781,10 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
||||
numtyp zr = jx.z - ix.z;
|
||||
numtyp r2 = xr*xr + yr*yr + zr*zr;
|
||||
|
||||
if (r2>off2) continue;
|
||||
|
||||
if (r2>off2) {
|
||||
if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj);
|
||||
continue;
|
||||
}
|
||||
numtyp r = ucl_sqrt(r2);
|
||||
numtyp rinv = ucl_recip(r);
|
||||
numtyp r2inv = rinv*rinv;
|
||||
@ -1091,3 +1093,112 @@ __kernel void k_special15(__global int * dev_nbor,
|
||||
|
||||
} // if ii
|
||||
}
|
||||
|
||||
/*
|
||||
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const numtyp off2, __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
int tid, ii, offset, n_stride, i;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
|
||||
int new_numj=0;
|
||||
|
||||
if (ii<inum) {
|
||||
int numj, nbor, nbor_end;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
if (i == 0) printf("i = 0: numj before = %d\n", numj);
|
||||
__global int *out_list=dev_nbor+nbor;
|
||||
const int out_stride=n_stride;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
|
||||
int sj=dev_packed[nbor];
|
||||
int j = sj & NEIGHMASK15;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<=off2) {
|
||||
*out_list=sj;
|
||||
out_list+=out_stride;
|
||||
|
||||
new_numj++;
|
||||
if (i == 0 && offset == 0) printf("neighbor of i = 0 within off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
|
||||
} else {
|
||||
if (i == 0 && offset == 0) printf("neighbor of i = 0 outside off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
|
||||
}
|
||||
} // for nbor
|
||||
} // if ii
|
||||
|
||||
if (t_per_atom>1) {
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1)
|
||||
new_numj += shfl_down(new_numj, s, t_per_atom);
|
||||
}
|
||||
if (offset==0 && ii<inum) {
|
||||
dev_nbor[ii+nbor_pitch]=new_numj;
|
||||
if (i == 0) printf("i = 0: numj after = %d\n", new_numj);
|
||||
}
|
||||
}
|
||||
*/
|
||||
#ifdef LAL_SIMD_IP_SYNC
|
||||
#define t_per_atom t_per_atom_in
|
||||
#else
|
||||
#define t_per_atom 1
|
||||
#endif
|
||||
|
||||
// ---------------------------------------------------------------------------
// k_amoeba_short_nbor: placeholder kernel for building a short neighbor list.
//
// As written, each work item only reads its global index (GLOBAL_ID_X) and
// tests it against inum; the whole filtering body below is commented out, so
// the kernel currently neither reads nor writes any of its buffers.
//
// Parameters (as used by the disabled draft in the body):
//   x_            position array (the draft fetches positions via pos_tex)
//   off2          threshold compared against the squared distance rsq
//   dev_nbor      destination buffer for the filtered list (writes disabled)
//   dev_packed    source neighbor data: atom index at [ii], neighbor count at
//                 [ii+nbor_pitch], neighbors following with stride nbor_pitch
//   inum          upper bound on the work-item index ii
//   nbor_pitch    stride between successive rows of the neighbor arrays
//   t_per_atom_in not referenced directly in the body; the draft uses the
//                 name t_per_atom instead
// ---------------------------------------------------------------------------
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const numtyp off2,
|
||||
__global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom_in) {
|
||||
const int ii=GLOBAL_ID_X;
|
||||
|
||||
if (ii<inum) {
|
||||
// NOTE: the draft below is disabled. It walks the neighbors of atom i in
// dev_packed, masks each entry with NEIGHMASK15, and counts those with
// rsq<off2; even inside the draft, the stores to out_list and to
// dev_nbor[ii+nbor_pitch] are themselves commented out.
/*
|
||||
const int i=dev_packed[ii];
|
||||
|
||||
int nbor=ii+nbor_pitch;
|
||||
const int numj=dev_packed[nbor];
|
||||
nbor+=nbor_pitch;
|
||||
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int newj=0;
|
||||
|
||||
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
|
||||
const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
|
||||
|
||||
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
||||
int sj=dev_packed[nbor];
|
||||
int j = sj & NEIGHMASK15;
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
numtyp dely = ix.y-jx.y;
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<off2) {
|
||||
//*out_list=sj;
|
||||
out_list++;
|
||||
newj++;
|
||||
if ((newj & (t_per_atom-1))==0)
|
||||
out_list+=out_stride;
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
//dev_nbor[ii+nbor_pitch]=newj;
|
||||
*/
|
||||
} // if ii
|
||||
}
|
||||
@ -40,6 +40,7 @@ BaseAmoebaT::~BaseAmoeba() {
|
||||
k_udirect2b.clear();
|
||||
k_umutual2b.clear();
|
||||
k_special15.clear();
|
||||
k_short_nbor.clear();
|
||||
if (pair_program) delete pair_program;
|
||||
}
|
||||
|
||||
@ -57,7 +58,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name_polar,
|
||||
const char *k_name_udirect2b,
|
||||
const char *k_name_umutual2b) {
|
||||
const char *k_name_umutual2b,
|
||||
const char *k_name_short_nbor) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
@ -89,16 +91,18 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
_block_bio_size=device->block_bio_pair();
|
||||
compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b);
|
||||
compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,
|
||||
k_name_umutual2b,k_name_short_nbor);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
} else {
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
}
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
|
||||
_gpu_host,max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
@ -223,6 +227,8 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
|
||||
add_onefive_neighbors();
|
||||
}
|
||||
|
||||
//nbor->copy_unpacked(inum,mn);
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
@ -692,7 +698,8 @@ template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *kname_polar,
|
||||
const char *kname_udirect2b,
|
||||
const char *kname_umutual2b) {
|
||||
const char *kname_umutual2b,
|
||||
const char *kname_short_nbor) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
@ -704,6 +711,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
k_polar.set_function(*pair_program,kname_polar);
|
||||
k_udirect2b.set_function(*pair_program,kname_udirect2b);
|
||||
k_umutual2b.set_function(*pair_program,kname_umutual2b);
|
||||
k_short_nbor.set_function(*pair_program,kname_short_nbor);
|
||||
k_special15.set_function(*pair_program,"k_special15");
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
q_tex.get_texture(*pair_program,"q_tex");
|
||||
|
||||
@ -55,7 +55,7 @@ class BaseAmoeba {
|
||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||
const double gpu_split, FILE *screen, const void *pair_program,
|
||||
const char *kname_polar, const char *kname_udirect2b,
|
||||
const char *kname_umutual2b);
|
||||
const char *kname_umutual2b, const char *kname_short_nbor);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead(const int add_kernels=0);
|
||||
@ -239,6 +239,7 @@ class BaseAmoeba {
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
|
||||
UCL_Kernel k_short_nbor;
|
||||
inline int block_size() { return _block_size; }
|
||||
inline void set_kernel(const int eflag, const int vflag) {}
|
||||
|
||||
@ -256,7 +257,7 @@ class BaseAmoeba {
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||
const char *kname_polar, const char *kname_udirect2b,
|
||||
const char *kname_umutual2b);
|
||||
const char *kname_umutual2b, const char *kname_short_nbor);
|
||||
|
||||
virtual int udirect2b(const int eflag, const int vflag) = 0;
|
||||
virtual int umutual2b(const int eflag, const int vflag) = 0;
|
||||
|
||||
@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
|
||||
|
||||
gpu_udirect2b_ready = true;
|
||||
gpu_umutual2b_ready = false;
|
||||
gpu_polar_real_ready = true;
|
||||
gpu_polar_real_ready = false;
|
||||
|
||||
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user