git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2016-07-01 23:27:26 +00:00
parent 8366b35459
commit 9656958169
245 changed files with 4890 additions and 4832 deletions

View File

@ -1,6 +1,6 @@
# Settings that the LAMMPS build will import when this package library is used
# settings for OpenCL builds
gpu_SYSINC =
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
gpu_SYSPATH =

View File

@ -7,7 +7,7 @@
EXTRAMAKE = Makefile.lammps.standard
ifeq($(CUDA_HOME),)
ifeq ($(CUDA_HOME),)
CUDA_HOME = /usr/local/cuda
endif

View File

@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-I$(CUDA_HOME)/include
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL
EXTRAMAKE = Makefile.lammps.mingw-cross

View File

@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
-I../../tools/mingw-cross/mpich2-win32/include/ \
-DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-I$(CUDA_HOME)/include
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../src/STUBS -lmpi_mingw64
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-I../../tools/mingw-cross/mpich2-win64/include/ \
-DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
_gpu_nbor=gpu_nbor;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
if (!success)
return NULL;
nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
// Originally nall == nlist was required so that, after re-arrangement,
// the neighbors of neighbors could be indexed directly:
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
// That requirement has been removed so this also works within pair hybrid.
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
@ -214,7 +220,7 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
@ -230,7 +236,7 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,nlocal,cpu_time);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
#ifdef THREE_CONCURRENT
ans2->inum(inum);

View File

@ -205,6 +205,7 @@ class BaseThree {
protected:
bool _compiled;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

View File

@ -127,7 +127,10 @@ void Neighbor::alloc(bool &success) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
_packed_permissions)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes();
dev_acc.clear();
success=success && (dev_acc.alloc(_max_atoms,*dev,
UCL_READ_WRITE)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
}
if (_max_host>0) {
nbor_host.clear();
@ -194,6 +197,7 @@ void Neighbor::clear() {
host_packed.clear();
host_acc.clear();
dev_acc.clear();
dev_nbor.clear();
nbor_host.clear();
dev_packed.clear();
@ -278,6 +282,15 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
UCL_H_Vec<int> host_view;
host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
host_view[i] = ii;
}
ucl_copy(dev_acc,host_view,true);
time_nbor.stop();
if (_use_packing==false) {

View File

@ -199,6 +199,8 @@ class Neighbor {
UCL_H_Vec<int> host_packed;
/// Host storage for nbor counts (row 1) & accumulated neighbor counts (row 2)
UCL_H_Vec<int> host_acc;
/// Device storage for accessing atom indices from the neighbor list (3-body)
UCL_D_Vec<int> dev_acc;
// ----------------- Data for GPU Neighbor Calculation ---------------

View File

@ -115,6 +115,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#define OCL_DEFAULT_VENDOR "generic"
#endif
#ifdef INTEL_OCL
#define OCL_DEFAULT_VENDOR "intel"
#endif
#ifdef PHI_OCL
#define OCL_DEFAULT_VENDOR "phi"
#endif
#ifndef OCL_DEFAULT_VENDOR
#define OCL_DEFAULT_VENDOR "none"
#endif

View File

@ -196,11 +196,12 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
@ -230,18 +231,21 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
this->time_pair.stop();
}

View File

@ -195,7 +195,6 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
numtyp sw_powerq=sw2_ijparam.w;
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
numtyp sw_cut=sw3_ijparam.x;
numtyp sw_cutsq=sw3_ijparam.y;
numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
pow(sw_sigma,sw_powerp);
numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
@ -345,7 +344,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const int t_per_atom, const int evatom) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -394,8 +392,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
@ -419,15 +415,11 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -467,14 +459,14 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -522,18 +514,20 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -559,15 +553,11 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -607,14 +597,14 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -662,18 +652,20 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -699,15 +691,11 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; // jik
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);

View File

@ -269,7 +269,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
else
_eflag=0;
int ainum=nall;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -279,7 +279,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -364,7 +364,7 @@ int ** TersoffT::compute(const int ago, const int inum_full,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -437,16 +437,18 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
this->time_pair.stop();

View File

@ -184,7 +184,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const int eflag, const int nall, const int inum,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -210,7 +210,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<nall) {
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
@ -597,11 +597,12 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -666,13 +667,17 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -818,11 +823,12 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -887,13 +893,17 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;

View File

@ -269,7 +269,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
else
_eflag=0;
int ainum=nall;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -279,7 +279,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -364,7 +364,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -437,16 +437,18 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
this->time_pair.stop();

View File

@ -184,7 +184,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const int eflag, const int nall, const int inum,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -210,7 +210,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<nall) {
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
@ -605,11 +605,12 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
@ -676,13 +677,17 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -835,11 +840,12 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
@ -906,13 +912,17 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;

View File

@ -294,7 +294,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
else
_eflag=0;
int ainum=nall;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -304,7 +304,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -389,7 +389,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -463,16 +463,18 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
this->time_pair.stop();

View File

@ -188,7 +188,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const int eflag, const int nall, const int inum,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -216,7 +216,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<nall) {
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
@ -617,11 +617,12 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -686,13 +687,17 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -838,11 +843,12 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -907,13 +913,17 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;