Working on the short neighbor list for the amoeba kernels (based on what has been done for tersoff and ellipsoid; nbor->dev_packed still needs to be allocated properly)
This commit is contained in:
@ -59,7 +59,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
|
|||||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
|
||||||
cell_size,gpu_split,_screen,amoeba,
|
cell_size,gpu_split,_screen,amoeba,
|
||||||
"k_amoeba_polar", "k_amoeba_udirect2b",
|
"k_amoeba_polar", "k_amoeba_udirect2b",
|
||||||
"k_amoeba_umutual2b");
|
"k_amoeba_umutual2b", "k_amoeba_short_nbor");
|
||||||
if (success!=0)
|
if (success!=0)
|
||||||
return success;
|
return success;
|
||||||
|
|
||||||
@ -157,16 +157,23 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
|
|||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
int AmoebaT::udirect2b(const int eflag, const int vflag) {
|
int AmoebaT::udirect2b(const int eflag, const int vflag) {
|
||||||
|
int _nall=this->atom->nall();
|
||||||
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
|
int ainum=this->ans->inum();
|
||||||
|
|
||||||
// Compute the block size and grid size to keep all cores busy
|
// Compute the block size and grid size to keep all cores busy
|
||||||
const int BX=this->block_size();
|
const int BX=this->block_size();
|
||||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
int GX;
|
||||||
(BX/this->_threads_per_atom)));
|
|
||||||
|
|
||||||
int _nall=this->atom->nall();
|
|
||||||
int ainum=this->ans->inum();
|
|
||||||
int nbor_pitch=this->nbor->nbor_pitch();
|
|
||||||
this->time_pair.start();
|
|
||||||
|
|
||||||
|
GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
|
||||||
|
this->k_short_nbor.set_size(GX,BX);
|
||||||
|
// NOTE: this->nbor->dev_packed is not allocated!!
|
||||||
|
/*
|
||||||
|
this->k_short_nbor.run(&this->atom->x, &_off2,
|
||||||
|
&this->nbor->dev_nbor, &this->nbor->dev_packed,
|
||||||
|
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||||
|
*/
|
||||||
|
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/(BX/this->_threads_per_atom)));
|
||||||
this->k_udirect2b.set_size(GX,BX);
|
this->k_udirect2b.set_size(GX,BX);
|
||||||
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
|
this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
|
||||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
|
|||||||
@ -781,8 +781,10 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
|
|||||||
numtyp zr = jx.z - ix.z;
|
numtyp zr = jx.z - ix.z;
|
||||||
numtyp r2 = xr*xr + yr*yr + zr*zr;
|
numtyp r2 = xr*xr + yr*yr + zr*zr;
|
||||||
|
|
||||||
if (r2>off2) continue;
|
if (r2>off2) {
|
||||||
|
if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
numtyp r = ucl_sqrt(r2);
|
numtyp r = ucl_sqrt(r2);
|
||||||
numtyp rinv = ucl_recip(r);
|
numtyp rinv = ucl_recip(r);
|
||||||
numtyp r2inv = rinv*rinv;
|
numtyp r2inv = rinv*rinv;
|
||||||
@ -1091,3 +1093,112 @@ __kernel void k_special15(__global int * dev_nbor,
|
|||||||
|
|
||||||
} // if ii
|
} // if ii
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
|
||||||
|
const numtyp off2, __global int * dev_nbor,
|
||||||
|
const __global int * dev_packed,
|
||||||
|
const int inum, const int nbor_pitch,
|
||||||
|
const int t_per_atom) {
|
||||||
|
int tid, ii, offset, n_stride, i;
|
||||||
|
atom_info(t_per_atom,ii,tid,offset);
|
||||||
|
|
||||||
|
int new_numj=0;
|
||||||
|
|
||||||
|
if (ii<inum) {
|
||||||
|
int numj, nbor, nbor_end;
|
||||||
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||||
|
n_stride,nbor_end,nbor);
|
||||||
|
|
||||||
|
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||||
|
if (i == 0) printf("i = 0: numj before = %d\n", numj);
|
||||||
|
__global int *out_list=dev_nbor+nbor;
|
||||||
|
const int out_stride=n_stride;
|
||||||
|
|
||||||
|
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||||
|
|
||||||
|
int sj=dev_packed[nbor];
|
||||||
|
int j = sj & NEIGHMASK15;
|
||||||
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
|
|
||||||
|
// Compute r12
|
||||||
|
numtyp delx = ix.x-jx.x;
|
||||||
|
numtyp dely = ix.y-jx.y;
|
||||||
|
numtyp delz = ix.z-jx.z;
|
||||||
|
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||||
|
|
||||||
|
if (rsq<=off2) {
|
||||||
|
*out_list=sj;
|
||||||
|
out_list+=out_stride;
|
||||||
|
|
||||||
|
new_numj++;
|
||||||
|
if (i == 0 && offset == 0) printf("neighbor of i = 0 within off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
|
||||||
|
} else {
|
||||||
|
if (i == 0 && offset == 0) printf("neighbor of i = 0 outside off2: j = %d; rsq = %f; new_numj = %d\n", j, rsq, new_numj);
|
||||||
|
}
|
||||||
|
} // for nbor
|
||||||
|
} // if ii
|
||||||
|
|
||||||
|
if (t_per_atom>1) {
|
||||||
|
for (unsigned int s=t_per_atom/2; s>0; s>>=1)
|
||||||
|
new_numj += shfl_down(new_numj, s, t_per_atom);
|
||||||
|
}
|
||||||
|
if (offset==0 && ii<inum) {
|
||||||
|
dev_nbor[ii+nbor_pitch]=new_numj;
|
||||||
|
if (i == 0) printf("i = 0: numj after = %d\n", new_numj);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
#ifdef LAL_SIMD_IP_SYNC
|
||||||
|
#define t_per_atom t_per_atom_in
|
||||||
|
#else
|
||||||
|
#define t_per_atom 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_,
|
||||||
|
const numtyp off2,
|
||||||
|
__global int * dev_nbor,
|
||||||
|
const __global int * dev_packed,
|
||||||
|
const int inum, const int nbor_pitch,
|
||||||
|
const int t_per_atom_in) {
|
||||||
|
const int ii=GLOBAL_ID_X;
|
||||||
|
|
||||||
|
if (ii<inum) {
|
||||||
|
/*
|
||||||
|
const int i=dev_packed[ii];
|
||||||
|
|
||||||
|
int nbor=ii+nbor_pitch;
|
||||||
|
const int numj=dev_packed[nbor];
|
||||||
|
nbor+=nbor_pitch;
|
||||||
|
const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
|
||||||
|
|
||||||
|
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||||
|
int newj=0;
|
||||||
|
|
||||||
|
__global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
|
||||||
|
const int out_stride=nbor_pitch*t_per_atom-t_per_atom;
|
||||||
|
|
||||||
|
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
|
||||||
|
int sj=dev_packed[nbor];
|
||||||
|
int j = sj & NEIGHMASK15;
|
||||||
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
|
|
||||||
|
// Compute r12
|
||||||
|
numtyp delx = ix.x-jx.x;
|
||||||
|
numtyp dely = ix.y-jx.y;
|
||||||
|
numtyp delz = ix.z-jx.z;
|
||||||
|
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||||
|
|
||||||
|
if (rsq<off2) {
|
||||||
|
//*out_list=sj;
|
||||||
|
out_list++;
|
||||||
|
newj++;
|
||||||
|
if ((newj & (t_per_atom-1))==0)
|
||||||
|
out_list+=out_stride;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // for nbor
|
||||||
|
//dev_nbor[ii+nbor_pitch]=newj;
|
||||||
|
*/
|
||||||
|
} // if ii
|
||||||
|
}
|
||||||
@ -40,6 +40,7 @@ BaseAmoebaT::~BaseAmoeba() {
|
|||||||
k_udirect2b.clear();
|
k_udirect2b.clear();
|
||||||
k_umutual2b.clear();
|
k_umutual2b.clear();
|
||||||
k_special15.clear();
|
k_special15.clear();
|
||||||
|
k_short_nbor.clear();
|
||||||
if (pair_program) delete pair_program;
|
if (pair_program) delete pair_program;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -57,7 +58,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
|||||||
FILE *_screen, const void *pair_program,
|
FILE *_screen, const void *pair_program,
|
||||||
const char *k_name_polar,
|
const char *k_name_polar,
|
||||||
const char *k_name_udirect2b,
|
const char *k_name_udirect2b,
|
||||||
const char *k_name_umutual2b) {
|
const char *k_name_umutual2b,
|
||||||
|
const char *k_name_short_nbor) {
|
||||||
screen=_screen;
|
screen=_screen;
|
||||||
|
|
||||||
int gpu_nbor=0;
|
int gpu_nbor=0;
|
||||||
@ -89,16 +91,18 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
|||||||
|
|
||||||
_block_size=device->pair_block_size();
|
_block_size=device->pair_block_size();
|
||||||
_block_bio_size=device->block_bio_pair();
|
_block_bio_size=device->block_bio_pair();
|
||||||
compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b);
|
compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,
|
||||||
|
k_name_umutual2b,k_name_short_nbor);
|
||||||
|
|
||||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||||
nbor->packing(true);
|
nbor->packing(true);
|
||||||
_nbor_data=&(nbor->dev_packed);
|
_nbor_data=&(nbor->dev_packed);
|
||||||
} else
|
} else {
|
||||||
_nbor_data=&(nbor->dev_nbor);
|
_nbor_data=&(nbor->dev_nbor);
|
||||||
|
}
|
||||||
|
|
||||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,
|
||||||
max_nbors,cell_size,false,_threads_per_atom);
|
_gpu_host,max_nbors,cell_size,false,_threads_per_atom);
|
||||||
if (success!=0)
|
if (success!=0)
|
||||||
return success;
|
return success;
|
||||||
|
|
||||||
@ -223,6 +227,8 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
|
|||||||
add_onefive_neighbors();
|
add_onefive_neighbors();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//nbor->copy_unpacked(inum,mn);
|
||||||
|
|
||||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||||
if (bytes>_max_an_bytes)
|
if (bytes>_max_an_bytes)
|
||||||
_max_an_bytes=bytes;
|
_max_an_bytes=bytes;
|
||||||
@ -692,7 +698,8 @@ template <class numtyp, class acctyp>
|
|||||||
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||||
const char *kname_polar,
|
const char *kname_polar,
|
||||||
const char *kname_udirect2b,
|
const char *kname_udirect2b,
|
||||||
const char *kname_umutual2b) {
|
const char *kname_umutual2b,
|
||||||
|
const char *kname_short_nbor) {
|
||||||
if (_compiled)
|
if (_compiled)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -704,6 +711,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
|||||||
k_polar.set_function(*pair_program,kname_polar);
|
k_polar.set_function(*pair_program,kname_polar);
|
||||||
k_udirect2b.set_function(*pair_program,kname_udirect2b);
|
k_udirect2b.set_function(*pair_program,kname_udirect2b);
|
||||||
k_umutual2b.set_function(*pair_program,kname_umutual2b);
|
k_umutual2b.set_function(*pair_program,kname_umutual2b);
|
||||||
|
k_short_nbor.set_function(*pair_program,kname_short_nbor);
|
||||||
k_special15.set_function(*pair_program,"k_special15");
|
k_special15.set_function(*pair_program,"k_special15");
|
||||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||||
q_tex.get_texture(*pair_program,"q_tex");
|
q_tex.get_texture(*pair_program,"q_tex");
|
||||||
|
|||||||
@ -55,7 +55,7 @@ class BaseAmoeba {
|
|||||||
const int maxspecial, const int maxspecial15, const double cell_size,
|
const int maxspecial, const int maxspecial15, const double cell_size,
|
||||||
const double gpu_split, FILE *screen, const void *pair_program,
|
const double gpu_split, FILE *screen, const void *pair_program,
|
||||||
const char *kname_polar, const char *kname_udirect2b,
|
const char *kname_polar, const char *kname_udirect2b,
|
||||||
const char *kname_umutual2b);
|
const char *kname_umutual2b, const char *kname_short_nbor);
|
||||||
|
|
||||||
/// Estimate the overhead for GPU context changes and CPU driver
|
/// Estimate the overhead for GPU context changes and CPU driver
|
||||||
void estimate_gpu_overhead(const int add_kernels=0);
|
void estimate_gpu_overhead(const int add_kernels=0);
|
||||||
@ -239,6 +239,7 @@ class BaseAmoeba {
|
|||||||
// ------------------------- DEVICE KERNELS -------------------------
|
// ------------------------- DEVICE KERNELS -------------------------
|
||||||
UCL_Program *pair_program;
|
UCL_Program *pair_program;
|
||||||
UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
|
UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
|
||||||
|
UCL_Kernel k_short_nbor;
|
||||||
inline int block_size() { return _block_size; }
|
inline int block_size() { return _block_size; }
|
||||||
inline void set_kernel(const int eflag, const int vflag) {}
|
inline void set_kernel(const int eflag, const int vflag) {}
|
||||||
|
|
||||||
@ -256,7 +257,7 @@ class BaseAmoeba {
|
|||||||
|
|
||||||
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
void compile_kernels(UCL_Device &dev, const void *pair_string,
|
||||||
const char *kname_polar, const char *kname_udirect2b,
|
const char *kname_polar, const char *kname_udirect2b,
|
||||||
const char *kname_umutual2b);
|
const char *kname_umutual2b, const char *kname_short_nbor);
|
||||||
|
|
||||||
virtual int udirect2b(const int eflag, const int vflag) = 0;
|
virtual int udirect2b(const int eflag, const int vflag) = 0;
|
||||||
virtual int umutual2b(const int eflag, const int vflag) = 0;
|
virtual int umutual2b(const int eflag, const int vflag) = 0;
|
||||||
|
|||||||
@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
|
|||||||
|
|
||||||
gpu_udirect2b_ready = true;
|
gpu_udirect2b_ready = true;
|
||||||
gpu_umutual2b_ready = false;
|
gpu_umutual2b_ready = false;
|
||||||
gpu_polar_real_ready = true;
|
gpu_polar_real_ready = false;
|
||||||
|
|
||||||
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user