updated
This commit is contained in:
@ -51,16 +51,31 @@ int LJSMOOTHT::init(const int ntypes,
|
|||||||
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
|
double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3,
|
||||||
double **host_ljsw4,
|
double **host_ljsw4,
|
||||||
double **cut_inner, double **cut_inner_sq) {
|
double **cut_inner, double **cut_inner_sq) {
|
||||||
|
const int max_shared_types=this->device->max_shared_types();
|
||||||
|
|
||||||
|
int onetype=0;
|
||||||
|
#ifdef USE_OPENCL
|
||||||
|
if (maxspecial==0)
|
||||||
|
for (int i=1; i<ntypes; i++)
|
||||||
|
for (int j=i; j<ntypes; j++)
|
||||||
|
if (host_cutsq[i][j]>0) {
|
||||||
|
if (onetype>0)
|
||||||
|
onetype=-1;
|
||||||
|
else if (onetype==0)
|
||||||
|
onetype=i*max_shared_types+j;
|
||||||
|
}
|
||||||
|
if (onetype<0) onetype=0;
|
||||||
|
#endif
|
||||||
|
|
||||||
int success;
|
int success;
|
||||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||||
_screen,lj_smooth,"k_lj_smooth");
|
_screen,lj_smooth,"k_lj_smooth",onetype);
|
||||||
if (success!=0)
|
if (success!=0)
|
||||||
return success;
|
return success;
|
||||||
|
|
||||||
// If atom type constants fit in shared memory use fast kernel
|
// If atom type constants fit in shared memory use fast kernel
|
||||||
int lj_types=ntypes;
|
int lj_types=ntypes;
|
||||||
shared_types=false;
|
shared_types=false;
|
||||||
int max_shared_types=this->device->max_shared_types();
|
|
||||||
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||||
lj_types=max_shared_types;
|
lj_types=max_shared_types;
|
||||||
shared_types=true;
|
shared_types=true;
|
||||||
@ -145,19 +160,9 @@ double LJSMOOTHT::host_memory_usage() const {
|
|||||||
// Calculate energies, forces, and torques
|
// Calculate energies, forces, and torques
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
int LJSMOOTHT::loop(const int _eflag, const int _vflag) {
|
int LJSMOOTHT::loop(const int eflag, const int vflag) {
|
||||||
// Compute the block size and grid size to keep all cores busy
|
// Compute the block size and grid size to keep all cores busy
|
||||||
const int BX=this->block_size();
|
const int BX=this->block_size();
|
||||||
int eflag, vflag;
|
|
||||||
if (_eflag)
|
|
||||||
eflag=1;
|
|
||||||
else
|
|
||||||
eflag=0;
|
|
||||||
|
|
||||||
if (_vflag)
|
|
||||||
vflag=1;
|
|
||||||
else
|
|
||||||
vflag=0;
|
|
||||||
|
|
||||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||||
(BX/this->_threads_per_atom)));
|
(BX/this->_threads_per_atom)));
|
||||||
@ -166,8 +171,8 @@ int LJSMOOTHT::loop(const int _eflag, const int _vflag) {
|
|||||||
int nbor_pitch=this->nbor->nbor_pitch();
|
int nbor_pitch=this->nbor->nbor_pitch();
|
||||||
this->time_pair.start();
|
this->time_pair.start();
|
||||||
if (shared_types) {
|
if (shared_types) {
|
||||||
this->k_pair_fast.set_size(GX,BX);
|
this->k_pair_sel->set_size(GX,BX);
|
||||||
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &sp_lj,
|
this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &sp_lj,
|
||||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||||
&this->ans->force, &this->ans->engv, &eflag,
|
&this->ans->force, &this->ans->engv, &eflag,
|
||||||
&vflag, &ainum, &nbor_pitch,
|
&vflag, &ainum, &nbor_pitch,
|
||||||
@ -180,6 +185,7 @@ int LJSMOOTHT::loop(const int _eflag, const int _vflag) {
|
|||||||
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
&ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||||
}
|
}
|
||||||
this->time_pair.stop();
|
this->time_pair.stop();
|
||||||
|
return GX;
|
||||||
}
|
}
|
||||||
|
|
||||||
template class LJSMOOTH<PRECISION,ACC_PRECISION>;
|
template class LJSMOOTH<PRECISION,ACC_PRECISION>;
|
||||||
|
|||||||
@ -40,16 +40,20 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
|
|||||||
int tid, ii, offset;
|
int tid, ii, offset;
|
||||||
atom_info(t_per_atom,ii,tid,offset);
|
atom_info(t_per_atom,ii,tid,offset);
|
||||||
|
|
||||||
acctyp energy=(acctyp)0;
|
int n_stride;
|
||||||
|
local_allocate_store_pair();
|
||||||
|
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp virial[6];
|
||||||
for (int i=0; i<6; i++)
|
acctyp energy, virial[6];
|
||||||
virial[i]=(acctyp)0;
|
if (EVFLAG) {
|
||||||
|
energy=(acctyp)0;
|
||||||
|
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
|
||||||
|
}
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor, nbor_end;
|
int i, numj, nbor, nbor_end;
|
||||||
__local int n_stride;
|
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||||
n_stride,nbor_end,nbor);
|
n_stride,nbor_end,nbor);
|
||||||
|
|
||||||
@ -96,7 +100,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
|
|||||||
f.y+=dely*force;
|
f.y+=dely*force;
|
||||||
f.z+=delz*force;
|
f.z+=delz*force;
|
||||||
|
|
||||||
if (eflag>0) {
|
if (EVFLAG && eflag) {
|
||||||
numtyp e;
|
numtyp e;
|
||||||
if (rsq < lj1[mtype].w)
|
if (rsq < lj1[mtype].w)
|
||||||
e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
|
e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
|
||||||
@ -108,7 +112,7 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
|
|||||||
//numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
|
//numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
|
||||||
energy+=factor_lj*e;
|
energy+=factor_lj*e;
|
||||||
}
|
}
|
||||||
if (vflag>0) {
|
if (EVFLAG && vflag) {
|
||||||
virial[0] += delx*delx*force;
|
virial[0] += delx*delx*force;
|
||||||
virial[1] += dely*dely*force;
|
virial[1] += dely*dely*force;
|
||||||
virial[2] += delz*delz*force;
|
virial[2] += delz*delz*force;
|
||||||
@ -119,9 +123,9 @@ __kernel void k_lj_smooth(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
} // for nbor
|
} // for nbor
|
||||||
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
|
|
||||||
ans,engv);
|
|
||||||
} // if ii
|
} // if ii
|
||||||
|
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
|
||||||
|
ans,engv);
|
||||||
}
|
}
|
||||||
|
|
||||||
__kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
__kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
||||||
@ -139,6 +143,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
|||||||
int tid, ii, offset;
|
int tid, ii, offset;
|
||||||
atom_info(t_per_atom,ii,tid,offset);
|
atom_info(t_per_atom,ii,tid,offset);
|
||||||
|
|
||||||
|
#ifndef ONETYPE
|
||||||
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||||
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
|
||||||
__local numtyp sp_lj[4];
|
__local numtyp sp_lj[4];
|
||||||
@ -146,40 +151,60 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
|||||||
sp_lj[tid]=sp_lj_in[tid];
|
sp_lj[tid]=sp_lj_in[tid];
|
||||||
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
|
||||||
lj1[tid]=lj1_in[tid];
|
lj1[tid]=lj1_in[tid];
|
||||||
if (eflag>0)
|
if (EVFLAG && eflag)
|
||||||
lj3[tid]=lj3_in[tid];
|
lj3[tid]=lj3_in[tid];
|
||||||
}
|
}
|
||||||
|
__syncthreads();
|
||||||
|
#else
|
||||||
|
const numtyp lj1x=lj1_in[ONETYPE].x;
|
||||||
|
const numtyp lj1y=lj1_in[ONETYPE].y;
|
||||||
|
const numtyp cutsq=lj1_in[ONETYPE].z;
|
||||||
|
numtyp lj3x, lj3y, lj3z;
|
||||||
|
if (EVFLAG && eflag) {
|
||||||
|
lj3x=lj3_in[ONETYPE].x;
|
||||||
|
lj3y=lj3_in[ONETYPE].y;
|
||||||
|
lj3z=lj3_in[ONETYPE].z;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int n_stride;
|
||||||
|
local_allocate_store_pair();
|
||||||
|
|
||||||
acctyp energy=(acctyp)0;
|
|
||||||
acctyp4 f;
|
acctyp4 f;
|
||||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||||
acctyp virial[6];
|
acctyp energy, virial[6];
|
||||||
for (int i=0; i<6; i++)
|
if (EVFLAG) {
|
||||||
virial[i]=(acctyp)0;
|
energy=(acctyp)0;
|
||||||
|
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
|
||||||
__syncthreads();
|
}
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor, nbor_end;
|
int i, numj, nbor, nbor_end;
|
||||||
__local int n_stride;
|
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||||
n_stride,nbor_end,nbor);
|
n_stride,nbor_end,nbor);
|
||||||
|
|
||||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||||
|
#ifndef ONETYPE
|
||||||
int iw=ix.w;
|
int iw=ix.w;
|
||||||
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
|
||||||
|
|
||||||
numtyp force, r6inv, factor_lj, forcelj;
|
numtyp force, r6inv, factor_lj, forcelj;
|
||||||
numtyp r, t, tsq, fskin;
|
numtyp r, t, tsq, fskin;
|
||||||
|
|
||||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
#endif
|
||||||
|
|
||||||
|
NOUNROLL
|
||||||
|
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||||
int j=dev_packed[nbor];
|
int j=dev_packed[nbor];
|
||||||
|
#ifndef ONETYPE
|
||||||
factor_lj = sp_lj[sbmask(j)];
|
factor_lj = sp_lj[sbmask(j)];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
#endif
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
|
#ifndef ONETYPE
|
||||||
int mtype=itype+jx.w;
|
int mtype=itype+jx.w;
|
||||||
|
#endif
|
||||||
|
|
||||||
// Compute r12
|
// Compute r12
|
||||||
numtyp delx = ix.x-jx.x;
|
numtyp delx = ix.x-jx.x;
|
||||||
@ -207,7 +232,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
|||||||
f.y+=dely*force;
|
f.y+=dely*force;
|
||||||
f.z+=delz*force;
|
f.z+=delz*force;
|
||||||
|
|
||||||
if (eflag>0) {
|
if (EVFLAG && eflag) {
|
||||||
numtyp e;
|
numtyp e;
|
||||||
if (rsq < lj1[mtype].w)
|
if (rsq < lj1[mtype].w)
|
||||||
e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
|
e = r6inv * (lj3[mtype].x*r6inv - lj3[mtype].y) - lj3[mtype].z;
|
||||||
@ -218,7 +243,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
energy+=factor_lj*e;
|
energy+=factor_lj*e;
|
||||||
}
|
}
|
||||||
if (vflag>0) {
|
if (EVFLAG && vflag) {
|
||||||
virial[0] += delx*delx*force;
|
virial[0] += delx*delx*force;
|
||||||
virial[1] += dely*dely*force;
|
virial[1] += dely*dely*force;
|
||||||
virial[2] += delz*delz*force;
|
virial[2] += delz*delz*force;
|
||||||
@ -229,7 +254,7 @@ __kernel void k_lj_smooth_fast(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
} // for nbor
|
} // for nbor
|
||||||
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
|
|
||||||
ans,engv);
|
|
||||||
} // if ii
|
} // if ii
|
||||||
|
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
|
||||||
|
ans,engv);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -57,7 +57,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
int init_ok=0;
|
int init_ok=0;
|
||||||
if (world_me==0)
|
if (world_me==0)
|
||||||
init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
|
init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
|
||||||
host_lj4, offset, special_lj, inum, nall, 300,
|
host_lj4, offset, special_lj, inum, nall, max_nbors,
|
||||||
maxspecial, cell_size, gpu_split, screen,
|
maxspecial, cell_size, gpu_split, screen,
|
||||||
host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
|
host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq);
|
||||||
|
|
||||||
@ -76,7 +76,7 @@ int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
|
|||||||
}
|
}
|
||||||
if (gpu_rank==i && world_me!=0)
|
if (gpu_rank==i && world_me!=0)
|
||||||
init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
|
init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
|
||||||
offset, special_lj, inum, nall, 300, maxspecial,
|
offset, special_lj, inum, nall, max_nbors, maxspecial,
|
||||||
cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3,
|
cell_size, gpu_split, screen, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3,
|
||||||
host_ljsw4, cut_inner, cut_inner_sq);
|
host_ljsw4, cut_inner, cut_inner_sq);
|
||||||
|
|
||||||
|
|||||||
@ -171,9 +171,10 @@ void PairLJSmoothGPU::init_style()
|
|||||||
int maxspecial=0;
|
int maxspecial=0;
|
||||||
if (atom->molecular)
|
if (atom->molecular)
|
||||||
maxspecial=atom->maxspecial;
|
maxspecial=atom->maxspecial;
|
||||||
|
int mnf = 5e-2 * neighbor->oneatom;
|
||||||
int success = ljsmt_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
|
int success = ljsmt_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4,
|
||||||
offset, force->special_lj, atom->nlocal,
|
offset, force->special_lj, atom->nlocal,
|
||||||
atom->nlocal+atom->nghost, 300, maxspecial,
|
atom->nlocal+atom->nghost, mnf, maxspecial,
|
||||||
cell_size, gpu_mode, screen, ljsw0, ljsw1, ljsw2,
|
cell_size, gpu_mode, screen, ljsw0, ljsw1, ljsw2,
|
||||||
ljsw3, ljsw4, cut_inner, cut_inner_sq);
|
ljsw3, ljsw4, cut_inner, cut_inner_sq);
|
||||||
GPU_EXTRA::check_flag(success,error,world);
|
GPU_EXTRA::check_flag(success,error,world);
|
||||||
|
|||||||
Reference in New Issue
Block a user