git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -1,6 +1,6 @@
|
||||
# Settings that the LAMMPS build will import when this package library is used
|
||||
# settings for OpenCL builds
|
||||
gpu_SYSINC =
|
||||
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
|
||||
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
|
||||
gpu_SYSPATH =
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
|
||||
OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
|
||||
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
|
||||
-I$(CUDA_HOME)/include
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
EXTRAMAKE = Makefile.lammps.mingw-cross
|
||||
|
||||
@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
|
||||
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
|
||||
-I../../tools/mingw-cross/mpich2-win32/include/ \
|
||||
-DMPICH_IGNORE_CXX_SEEK
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
||||
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
|
||||
OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
|
||||
-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
|
||||
-I$(CUDA_HOME)/include
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../src/STUBS -lmpi_mingw64
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
||||
@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
|
||||
-I../../tools/mingw-cross/mpich2-win64/include/ \
|
||||
-DMPICH_IGNORE_CXX_SEEK
|
||||
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
||||
@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
gpu_nbor=1;
|
||||
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
_gpu_nbor=gpu_nbor;
|
||||
|
||||
int _gpu_host=0;
|
||||
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
||||
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
||||
if (!success)
|
||||
return NULL;
|
||||
|
||||
nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
|
||||
// originally the requirement that nall == nlist was enforced
|
||||
// to allow direct indexing of neighbors of neighbors after re-arrangement
|
||||
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
|
||||
|
||||
// now the requirement is removed, allowing this to work within pair hybrid
|
||||
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
#ifdef THREE_CONCURRENT
|
||||
@ -214,7 +220,7 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials, etc.
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
@ -230,7 +236,7 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
}
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,nlocal,cpu_time);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
ans2->inum(inum);
|
||||
|
||||
@ -205,6 +205,7 @@ class BaseThree {
|
||||
protected:
|
||||
bool _compiled;
|
||||
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
|
||||
int _gpu_nbor;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
@ -127,7 +127,10 @@ void Neighbor::alloc(bool &success) {
|
||||
dev_packed.clear();
|
||||
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
|
||||
_packed_permissions)==UCL_SUCCESS);
|
||||
_c_bytes+=dev_packed.row_bytes();
|
||||
dev_acc.clear();
|
||||
success=success && (dev_acc.alloc(_max_atoms,*dev,
|
||||
UCL_READ_WRITE)==UCL_SUCCESS);
|
||||
_c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
|
||||
}
|
||||
if (_max_host>0) {
|
||||
nbor_host.clear();
|
||||
@ -194,6 +197,7 @@ void Neighbor::clear() {
|
||||
|
||||
host_packed.clear();
|
||||
host_acc.clear();
|
||||
dev_acc.clear();
|
||||
dev_nbor.clear();
|
||||
nbor_host.clear();
|
||||
dev_packed.clear();
|
||||
@ -278,6 +282,15 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
|
||||
UCL_D_Vec<int> acc_view;
|
||||
acc_view.view_offset(inum,dev_nbor,inum*2);
|
||||
ucl_copy(acc_view,host_acc,true);
|
||||
|
||||
UCL_H_Vec<int> host_view;
|
||||
host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
|
||||
for (int ii=0; ii<inum; ii++) {
|
||||
int i=ilist[ii];
|
||||
host_view[i] = ii;
|
||||
}
|
||||
ucl_copy(dev_acc,host_view,true);
|
||||
|
||||
time_nbor.stop();
|
||||
|
||||
if (_use_packing==false) {
|
||||
|
||||
@ -199,6 +199,8 @@ class Neighbor {
|
||||
UCL_H_Vec<int> host_packed;
|
||||
/// Host storage for nbor counts (row 1) & accumulated neighbor counts (row 2)
|
||||
UCL_H_Vec<int> host_acc;
|
||||
/// Device storage for accessing atom indices from the neighbor list (3-body)
|
||||
UCL_D_Vec<int> dev_acc;
|
||||
|
||||
// ----------------- Data for GPU Neighbor Calculation ---------------
|
||||
|
||||
|
||||
@ -115,6 +115,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
|
||||
#define OCL_DEFAULT_VENDOR "generic"
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_OCL
|
||||
#define OCL_DEFAULT_VENDOR "intel"
|
||||
#endif
|
||||
|
||||
#ifdef PHI_OCL
|
||||
#define OCL_DEFAULT_VENDOR "phi"
|
||||
#endif
|
||||
|
||||
#ifndef OCL_DEFAULT_VENDOR
|
||||
#define OCL_DEFAULT_VENDOR "none"
|
||||
#endif
|
||||
|
||||
@ -196,11 +196,12 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
@ -230,18 +231,21 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
} else {
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
}
|
||||
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
|
||||
@ -195,7 +195,6 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
numtyp sw_powerq=sw2_ijparam.w;
|
||||
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
|
||||
numtyp sw_cut=sw3_ijparam.x;
|
||||
numtyp sw_cutsq=sw3_ijparam.y;
|
||||
numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
|
||||
pow(sw_sigma,sw_powerp);
|
||||
numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
|
||||
@ -345,7 +344,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
const int t_per_atom, const int evatom) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||
|
||||
@ -394,8 +392,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
if (rsq1 > sw3_ijparam.y) continue;
|
||||
|
||||
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||
sw_sigma=sw1_ijparam.y;
|
||||
sw_gamma=sw1_ijparam.w;
|
||||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ij=sw3_ijparam.x;
|
||||
|
||||
@ -419,15 +415,11 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
|
||||
if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
|
||||
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||
sw_sigma=sw1_ikparam.y;
|
||||
sw_gamma=sw1_ikparam.w;
|
||||
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ik=sw3_ikparam.x;
|
||||
|
||||
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
|
||||
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||
sw_epsilon=sw1_ijkparam.x;
|
||||
sw_lambda=sw1_ijkparam.z;
|
||||
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||
@ -467,14 +459,14 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||
|
||||
@ -522,18 +514,20 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
if (rsq1 > sw3_ijparam.y) continue;
|
||||
|
||||
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||
sw_sigma=sw1_ijparam.y;
|
||||
sw_gamma=sw1_ijparam.w;
|
||||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ij=sw3_ijparam.x;
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -559,15 +553,11 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (rsq2 < sw3_ikparam.y) {
|
||||
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||
sw_sigma=sw1_ikparam.y;
|
||||
sw_gamma=sw1_ikparam.w;
|
||||
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ik=sw3_ikparam.x;
|
||||
|
||||
int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik
|
||||
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||
sw_epsilon=sw1_ijkparam.x;
|
||||
sw_lambda=sw1_ijkparam.z;
|
||||
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||
@ -607,14 +597,14 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||
|
||||
@ -662,18 +652,20 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
if (rsq1 > sw3_ijparam.y) continue;
|
||||
|
||||
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||
sw_sigma=sw1_ijparam.y;
|
||||
sw_gamma=sw1_ijparam.w;
|
||||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ij=sw3_ijparam.x;
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -699,15 +691,11 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (rsq2 < sw3_ikparam.y) {
|
||||
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||
sw_sigma=sw1_ikparam.y;
|
||||
sw_gamma=sw1_ikparam.w;
|
||||
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ik=sw3_ikparam.x;
|
||||
|
||||
int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; // jik
|
||||
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||
sw_epsilon=sw1_ijkparam.x;
|
||||
sw_lambda=sw1_ijkparam.z;
|
||||
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||
|
||||
@ -269,7 +269,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
@ -279,7 +279,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -364,7 +364,7 @@ int ** TersoffT::compute(const int ago, const int inum_full,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -437,16 +437,18 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
} else {
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -184,7 +184,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int eflag, const int nall, const int inum,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
tpa_sq = fast_mul(t_per_atom,t_per_atom);
|
||||
@ -210,7 +210,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<nall) {
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
@ -597,11 +597,12 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
|
||||
@ -666,13 +667,17 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -818,11 +823,12 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
|
||||
@ -887,13 +893,17 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
|
||||
@ -269,7 +269,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
@ -279,7 +279,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -364,7 +364,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -437,16 +437,18 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
} else {
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -184,7 +184,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int eflag, const int nall, const int inum,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
tpa_sq = fast_mul(t_per_atom,t_per_atom);
|
||||
@ -210,7 +210,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<nall) {
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
@ -605,11 +605,12 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
|
||||
@ -676,13 +677,17 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -835,11 +840,12 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
|
||||
@ -906,13 +912,17 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
|
||||
@ -294,7 +294,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
@ -304,7 +304,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -389,7 +389,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -463,16 +463,18 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
} else {
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -188,7 +188,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int eflag, const int nall, const int inum,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
tpa_sq = fast_mul(t_per_atom,t_per_atom);
|
||||
@ -216,7 +216,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<nall) {
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
@ -617,11 +617,12 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
|
||||
@ -686,13 +687,17 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -838,11 +843,12 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
|
||||
@ -907,13 +913,17 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
|
||||
Reference in New Issue
Block a user