git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -1,6 +1,6 @@
|
||||
# Settings that the LAMMPS build will import when this package library is used
|
||||
# settings for OpenCL builds
|
||||
gpu_SYSINC =
|
||||
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
|
||||
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
|
||||
gpu_SYSPATH =
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
|
||||
EXTRAMAKE = Makefile.lammps.standard
|
||||
|
||||
ifeq($(CUDA_HOME),)
|
||||
ifeq ($(CUDA_HOME),)
|
||||
CUDA_HOME = /usr/local/cuda
|
||||
endif
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
|
||||
OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
|
||||
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
|
||||
-I$(CUDA_HOME)/include
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
EXTRAMAKE = Makefile.lammps.mingw-cross
|
||||
|
||||
@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
|
||||
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
|
||||
-I../../tools/mingw-cross/mpich2-win32/include/ \
|
||||
-DMPICH_IGNORE_CXX_SEEK
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
||||
@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
|
||||
OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
|
||||
-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
|
||||
-I$(CUDA_HOME)/include
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../src/STUBS -lmpi_mingw64
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
||||
@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
|
||||
-I../../tools/mingw-cross/mpich2-win64/include/ \
|
||||
-DMPICH_IGNORE_CXX_SEEK
|
||||
|
||||
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
|
||||
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
|
||||
-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
|
||||
OCL_PREC = -D_SINGLE_DOUBLE
|
||||
OCL_TUNE = -DFERMI_OCL
|
||||
|
||||
@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
|
||||
cuDriverGetVersion(&driver_version);
|
||||
out << "CUDA Driver Version: "
|
||||
<< driver_version/1000 << "." << driver_version%100
|
||||
<< std::endl;
|
||||
<< std::endl;
|
||||
#endif
|
||||
|
||||
if (num_devices() == 0)
|
||||
|
||||
@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
||||
gpu_nbor=1;
|
||||
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
_gpu_nbor=gpu_nbor;
|
||||
|
||||
int _gpu_host=0;
|
||||
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
|
||||
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
||||
if (!success)
|
||||
return NULL;
|
||||
|
||||
nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
|
||||
// originally the requirement that nall == nlist was enforced
|
||||
// to allow direct indexing neighbors of neighbors after re-arrangement
|
||||
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
|
||||
|
||||
// now the requirement is removed, allowing to work within pair hybrid
|
||||
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
#ifdef THREE_CONCURRENT
|
||||
@ -185,11 +191,11 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
const int nall, double **host_x,
|
||||
int *host_type, double *sublo,
|
||||
double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special,
|
||||
bool &success) {
|
||||
success=true;
|
||||
resize_atom(inum,nall,success);
|
||||
resize_local(nall,host_inum,nbor->max_nbors(),success);
|
||||
@ -214,7 +220,7 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
|
||||
// Copy nbor list from host if necessary and then calculate forces, virials,..
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
|
||||
const int nlist, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
@ -230,7 +236,7 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
}
|
||||
|
||||
int ago=hd_balancer.ago_first(f_ago);
|
||||
int inum=hd_balancer.balance(ago,nlocal,cpu_time);
|
||||
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
|
||||
ans->inum(inum);
|
||||
#ifdef THREE_CONCURRENT
|
||||
ans2->inum(inum);
|
||||
|
||||
@ -205,6 +205,7 @@ class BaseThree {
|
||||
protected:
|
||||
bool _compiled;
|
||||
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
|
||||
int _gpu_nbor;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
@ -80,7 +80,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_d,host_offset);
|
||||
host_d,host_offset);
|
||||
|
||||
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||
@ -113,7 +113,7 @@ void BornT::reinit(const int ntypes, double **host_rhoinv,
|
||||
this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
|
||||
host_born1,host_born2,host_born3);
|
||||
this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_d,host_offset);
|
||||
host_d,host_offset);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
||||
@ -84,7 +84,7 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
|
||||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_d,host_offset);
|
||||
host_d,host_offset);
|
||||
|
||||
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||
|
||||
@ -84,7 +84,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
|
||||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_d,host_offset);
|
||||
host_d,host_offset);
|
||||
|
||||
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
|
||||
|
||||
@ -79,7 +79,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
@ -81,7 +81,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq,
|
||||
|
||||
@ -83,7 +83,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
|
||||
|
||||
@ -80,7 +80,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
@ -117,9 +117,9 @@ int** crml_gpu_compute_n(const int ago, const int inum_full,
|
||||
}
|
||||
|
||||
void crml_gpu_compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool eflag, const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start, const double cpu_time,
|
||||
bool &success, double *host_q, const int nlocal,
|
||||
double *boxlo, double *prd) {
|
||||
|
||||
@ -51,7 +51,7 @@ int CoulLongT::init(const int ntypes, double **host_scale,
|
||||
const double qqrd2e, const double g_ewald) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
|
||||
gpu_split,_screen,coul_long,"k_coul_long");
|
||||
gpu_split,_screen,coul_long,"k_coul_long");
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
||||
@ -40,9 +40,9 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
|
||||
int init(const int ntypes, double **scale,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const double host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e, const double g_ewald);
|
||||
const double gpu_split, FILE *screen,
|
||||
const double host_cut_coulsq, double *host_special_coul,
|
||||
const double qqrd2e, const double g_ewald);
|
||||
|
||||
/// Send updated coeffs from host to device (to be compatible with fix adapt)
|
||||
void reinit(const int ntypes, double **scale);
|
||||
|
||||
@ -114,28 +114,28 @@ void cl_gpu_clear() {
|
||||
}
|
||||
|
||||
int** cl_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo,
|
||||
double *prd) {
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag, int **nspecial,
|
||||
tagint **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success, double *host_q, double *boxlo,
|
||||
double *prd) {
|
||||
return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd);
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success,
|
||||
host_q, boxlo, prd);
|
||||
}
|
||||
|
||||
void cl_gpu_compute(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, double *host_q,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
double **host_x, int *host_type, int *ilist, int *numj,
|
||||
int **firstneigh, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
const double cpu_time, bool &success, double *host_q,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
|
||||
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
|
||||
host_q,nlocal,boxlo,prd);
|
||||
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
|
||||
host_q,nlocal,boxlo,prd);
|
||||
}
|
||||
|
||||
double cl_gpu_bytes() {
|
||||
|
||||
@ -650,7 +650,7 @@ int DeviceT::compile_kernels() {
|
||||
int flag=0;
|
||||
|
||||
if (_compiled)
|
||||
return flag;
|
||||
return flag;
|
||||
|
||||
dev_program=new UCL_Program(*gpu);
|
||||
int success=dev_program->load_string(device,compile_string().c_str());
|
||||
|
||||
@ -238,7 +238,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
|
||||
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
|
||||
r3inv = r2inv*rinv;
|
||||
r5inv = r3inv*r2inv;
|
||||
r7inv = r5inv*r2inv;
|
||||
r7inv = r5inv*r2inv;
|
||||
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
|
||||
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
|
||||
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
|
||||
|
||||
@ -76,7 +76,7 @@ int DPDT::init(const int ntypes,
|
||||
|
||||
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma,
|
||||
host_sigma,host_cut);
|
||||
host_sigma,host_cut);
|
||||
|
||||
UCL_H_Vec<numtyp> host_rsq(lj_types*lj_types,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
@ -164,7 +164,7 @@ void DPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma,
|
||||
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
|
||||
UCL_WRITE_ONLY);
|
||||
this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma,
|
||||
host_sigma,host_cut);
|
||||
host_sigma,host_cut);
|
||||
}
|
||||
|
||||
template class DPD<PRECISION,ACC_PRECISION>;
|
||||
|
||||
@ -75,7 +75,7 @@ int GaussT::init(const int ntypes,
|
||||
|
||||
gauss1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b,
|
||||
host_cutsq,host_offset);
|
||||
host_cutsq,host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
@ -99,7 +99,7 @@ void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a,
|
||||
host_write[i]=0.0;
|
||||
|
||||
this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b,
|
||||
host_cutsq,host_offset);
|
||||
host_cutsq,host_offset);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
||||
@ -84,19 +84,19 @@ int GayBerneT::init(const int ntypes, const double gamma,
|
||||
|
||||
sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
|
||||
host_sigma,host_epsilon);
|
||||
host_sigma,host_epsilon);
|
||||
|
||||
this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
|
||||
host_cutsq,h_form);
|
||||
host_cutsq,h_form);
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq,h_form);
|
||||
host_cutsq,h_form);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
dev_error.zero();
|
||||
@ -209,7 +209,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
|
||||
(BX/this->_threads_per_atom)));
|
||||
NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
|
||||
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
|
||||
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
|
||||
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
|
||||
this->time_nbor1.stop();
|
||||
|
||||
this->time_ellipsoid.start();
|
||||
@ -242,7 +242,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
|
||||
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
|
||||
this->_last_ellipse)/BX));
|
||||
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
|
||||
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
|
||||
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
|
||||
this->time_nbor2.stop();
|
||||
|
||||
this->time_ellipsoid2.start();
|
||||
@ -300,7 +300,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
|
||||
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
this->time_nbor1.start();
|
||||
this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
|
||||
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
|
||||
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
|
||||
this->time_nbor1.stop();
|
||||
this->time_ellipsoid.start();
|
||||
this->k_ellipsoid.set_size(GX,BX);
|
||||
|
||||
@ -26,58 +26,58 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
|
||||
den = ucl_recip(den);
|
||||
|
||||
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
|
||||
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
|
||||
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
|
||||
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
|
||||
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
|
||||
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
|
||||
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
|
||||
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
|
||||
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
|
||||
|
||||
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
|
||||
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
|
||||
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
|
||||
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
|
||||
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
|
||||
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
|
||||
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
|
||||
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
|
||||
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
|
||||
|
||||
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
|
||||
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
|
||||
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
|
||||
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
|
||||
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
|
||||
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
|
||||
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
|
||||
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
|
||||
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
|
||||
|
||||
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
|
||||
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
|
||||
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
|
||||
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
|
||||
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
|
||||
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
|
||||
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
|
||||
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
|
||||
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
|
||||
|
||||
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
|
||||
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
|
||||
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
|
||||
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
|
||||
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
|
||||
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
|
||||
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
|
||||
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
|
||||
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
|
||||
|
||||
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
|
||||
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
|
||||
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
|
||||
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
|
||||
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
|
||||
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
|
||||
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
|
||||
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
|
||||
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
|
||||
|
||||
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
|
||||
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
|
||||
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
|
||||
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
|
||||
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
|
||||
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
|
||||
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
|
||||
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
|
||||
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
|
||||
|
||||
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
|
||||
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
|
||||
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
|
||||
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
|
||||
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
|
||||
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
|
||||
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
|
||||
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
|
||||
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
|
||||
|
||||
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
|
||||
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
|
||||
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
|
||||
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
|
||||
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
|
||||
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
|
||||
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
|
||||
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
|
||||
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
|
||||
}
|
||||
|
||||
__kernel void k_gayberne(const __global numtyp4 *restrict x_,
|
||||
|
||||
@ -76,11 +76,11 @@ int LJT::init(const int ntypes,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq);
|
||||
host_cutsq);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
@ -76,11 +76,11 @@ int LJ96T::init(const int ntypes,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq);
|
||||
host_cutsq);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
@ -80,11 +80,11 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq, host_cut_ljsq);
|
||||
host_cutsq, host_cut_ljsq);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<4; i++) {
|
||||
|
||||
@ -79,11 +79,11 @@ int LJCoulT::init(const int ntypes,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cut_ljsq, host_cut_coulsq);
|
||||
host_cut_ljsq, host_cut_coulsq);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
|
||||
|
||||
@ -80,11 +80,11 @@ int LJCoulDebyeT::init(const int ntypes,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cut_ljsq, host_cut_coulsq);
|
||||
host_cut_ljsq, host_cut_coulsq);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
|
||||
|
||||
@ -80,11 +80,11 @@ int LJCoulLongT::init(const int ntypes,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq, host_cut_ljsq);
|
||||
host_cutsq, host_cut_ljsq);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<4; i++) {
|
||||
|
||||
@ -81,11 +81,11 @@ int LJCoulMSMT::init(const int ntypes,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq, host_cut_ljsq);
|
||||
host_cutsq, host_cut_ljsq);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
// pack gcons and dgcons
|
||||
int nrows, ncols;
|
||||
|
||||
@ -77,11 +77,11 @@ int LJCubicT::init(const int ntypes,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq);
|
||||
host_cutsq);
|
||||
|
||||
lj2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj2,host_write,host_cut_inner_sq,
|
||||
host_cut_inner,host_sigma,host_epsilon);
|
||||
host_cut_inner,host_sigma,host_epsilon);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack2(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4);
|
||||
|
||||
@ -84,11 +84,11 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cut_ljsq, host_cutsq);
|
||||
host_cut_ljsq, host_cutsq);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
|
||||
for (int i=0; i<4; i++) {
|
||||
|
||||
@ -76,11 +76,11 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq, host_shift);
|
||||
host_cutsq, host_shift);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
@ -82,9 +82,9 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
|
||||
int mtype=itype*lj_types+jtype;
|
||||
if (r2inv<lj1[mtype].z) {
|
||||
numtyp r = ucl_sqrt(r2inv);
|
||||
numtyp rshift = r - lj1[mtype].w;
|
||||
numtyp rshiftsq = rshift*rshift;
|
||||
r2inv = ucl_recip(rshiftsq);
|
||||
numtyp rshift = r - lj1[mtype].w;
|
||||
numtyp rshiftsq = rshift*rshift;
|
||||
r2inv = ucl_recip(rshiftsq);
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
force*=factor_lj/rshift/r;
|
||||
@ -175,9 +175,9 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (r2inv<lj1[mtype].z) {
|
||||
numtyp r = ucl_sqrt(r2inv);
|
||||
numtyp rshift = r - lj1[mtype].w;
|
||||
numtyp rshiftsq = rshift*rshift;
|
||||
r2inv = ucl_recip(rshiftsq);
|
||||
numtyp rshift = r - lj1[mtype].w;
|
||||
numtyp rshiftsq = rshift*rshift;
|
||||
r2inv = ucl_recip(rshiftsq);
|
||||
numtyp r6inv = r2inv*r2inv*r2inv;
|
||||
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
|
||||
force*=factor_lj/rshift/r;
|
||||
|
||||
@ -76,11 +76,11 @@ int MieT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
mie1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,mie1,host_write,host_mie1,host_mie2,
|
||||
host_gamA,host_gamR);
|
||||
host_gamA,host_gamR);
|
||||
|
||||
mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4,
|
||||
host_offset,host_cutsq);
|
||||
host_offset,host_cutsq);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
@ -127,7 +127,10 @@ void Neighbor::alloc(bool &success) {
|
||||
dev_packed.clear();
|
||||
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
|
||||
_packed_permissions)==UCL_SUCCESS);
|
||||
_c_bytes+=dev_packed.row_bytes();
|
||||
dev_acc.clear();
|
||||
success=success && (dev_acc.alloc(_max_atoms,*dev,
|
||||
UCL_READ_WRITE)==UCL_SUCCESS);
|
||||
_c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
|
||||
}
|
||||
if (_max_host>0) {
|
||||
nbor_host.clear();
|
||||
@ -194,6 +197,7 @@ void Neighbor::clear() {
|
||||
|
||||
host_packed.clear();
|
||||
host_acc.clear();
|
||||
dev_acc.clear();
|
||||
dev_nbor.clear();
|
||||
nbor_host.clear();
|
||||
dev_packed.clear();
|
||||
@ -225,7 +229,7 @@ double Neighbor::host_memory_usage() const {
|
||||
}
|
||||
|
||||
void Neighbor::get_host(const int inum, int *ilist, int *numj,
|
||||
int **firstneigh, const int block_size) {
|
||||
int **firstneigh, const int block_size) {
|
||||
_nbor_time_avail=true;
|
||||
time_nbor.start();
|
||||
|
||||
@ -278,6 +282,15 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
|
||||
UCL_D_Vec<int> acc_view;
|
||||
acc_view.view_offset(inum,dev_nbor,inum*2);
|
||||
ucl_copy(acc_view,host_acc,true);
|
||||
|
||||
UCL_H_Vec<int> host_view;
|
||||
host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
|
||||
for (int ii=0; ii<inum; ii++) {
|
||||
int i=ilist[ii];
|
||||
host_view[i] = ii;
|
||||
}
|
||||
ucl_copy(dev_acc,host_view,true);
|
||||
|
||||
time_nbor.stop();
|
||||
|
||||
if (_use_packing==false) {
|
||||
|
||||
@ -199,6 +199,8 @@ class Neighbor {
|
||||
UCL_H_Vec<int> host_packed;
|
||||
/// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
|
||||
UCL_H_Vec<int> host_acc;
|
||||
/// Device storage for accessing atom indices from the neighbor list (3-body)
|
||||
UCL_D_Vec<int> dev_acc;
|
||||
|
||||
// ----------------- Data for GPU Neighbor Calculation ---------------
|
||||
|
||||
|
||||
@ -118,24 +118,24 @@ __kernel void transpose(__global tagint *restrict out,
|
||||
const __global tagint *restrict in,
|
||||
int columns_in, int rows_in)
|
||||
{
|
||||
__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||
__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||
|
||||
unsigned ti=THREAD_ID_X;
|
||||
unsigned tj=THREAD_ID_Y;
|
||||
unsigned bi=BLOCK_ID_X;
|
||||
unsigned bj=BLOCK_ID_Y;
|
||||
unsigned ti=THREAD_ID_X;
|
||||
unsigned tj=THREAD_ID_Y;
|
||||
unsigned bi=BLOCK_ID_X;
|
||||
unsigned bj=BLOCK_ID_Y;
|
||||
|
||||
unsigned i=bi*BLOCK_CELL_2D+ti;
|
||||
unsigned j=bj*BLOCK_CELL_2D+tj;
|
||||
if ((i<columns_in) && (j<rows_in))
|
||||
block[tj][ti]=in[j*columns_in+i];
|
||||
unsigned i=bi*BLOCK_CELL_2D+ti;
|
||||
unsigned j=bj*BLOCK_CELL_2D+tj;
|
||||
if ((i<columns_in) && (j<rows_in))
|
||||
block[tj][ti]=in[j*columns_in+i];
|
||||
|
||||
__syncthreads();
|
||||
__syncthreads();
|
||||
|
||||
i=bj*BLOCK_CELL_2D+ti;
|
||||
j=bi*BLOCK_CELL_2D+tj;
|
||||
if ((i<rows_in) && (j<columns_in))
|
||||
out[j*rows_in+i] = block[ti][tj];
|
||||
i=bj*BLOCK_CELL_2D+ti;
|
||||
j=bi*BLOCK_CELL_2D+tj;
|
||||
if ((i<rows_in) && (j<columns_in))
|
||||
out[j*rows_in+i] = block[ti][tj];
|
||||
}
|
||||
|
||||
__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
@ -191,7 +191,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
nbor_list[pid_i]=pid_i;
|
||||
} else {
|
||||
stride=0;
|
||||
neigh_counts=host_numj+pid_i-inum;
|
||||
neigh_counts=host_numj+pid_i-inum;
|
||||
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
|
||||
}
|
||||
|
||||
@ -243,8 +243,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
} // for (k)
|
||||
__syncthreads();
|
||||
} // for (k)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -51,7 +51,7 @@ void NeighborShared::clear() {
|
||||
void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor,
|
||||
const std::string flags) {
|
||||
if (_compiled)
|
||||
return;
|
||||
return;
|
||||
|
||||
_gpu_nbor=gpu_nbor;
|
||||
if (_gpu_nbor==0) {
|
||||
|
||||
@ -270,19 +270,19 @@ __kernel void interp(const __global numtyp4 *restrict x_,
|
||||
int my=mz+fast_mul(ny,npts_x);
|
||||
for (int m=0; m<order; m++) {
|
||||
grdtyp y0=z0*rho1d_1[m][tid];
|
||||
for (int l=0; l<order; l++) {
|
||||
grdtyp x0=y0*rho1d_0[l][tid];
|
||||
grdtyp4 el=brick[my+l];
|
||||
ek.x-=x0*el.x;
|
||||
ek.y-=x0*el.y;
|
||||
ek.z-=x0*el.z;
|
||||
}
|
||||
for (int l=0; l<order; l++) {
|
||||
grdtyp x0=y0*rho1d_0[l][tid];
|
||||
grdtyp4 el=brick[my+l];
|
||||
ek.x-=x0*el.x;
|
||||
ek.y-=x0*el.y;
|
||||
ek.z-=x0*el.z;
|
||||
}
|
||||
my+=npts_x;
|
||||
}
|
||||
mz+=npts_yx;
|
||||
}
|
||||
}
|
||||
}
|
||||
ans[ii]=ek;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -115,6 +115,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
|
||||
#define OCL_DEFAULT_VENDOR "generic"
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_OCL
|
||||
#define OCL_DEFAULT_VENDOR "intel"
|
||||
#endif
|
||||
|
||||
#ifdef PHI_OCL
|
||||
#define OCL_DEFAULT_VENDOR "phi"
|
||||
#endif
|
||||
|
||||
#ifndef OCL_DEFAULT_VENDOR
|
||||
#define OCL_DEFAULT_VENDOR "none"
|
||||
#endif
|
||||
|
||||
@ -81,19 +81,19 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
|
||||
|
||||
sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
|
||||
host_sigma,host_epsilon);
|
||||
host_sigma,host_epsilon);
|
||||
|
||||
this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
|
||||
host_cutsq,h_form);
|
||||
host_cutsq,h_form);
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq,h_form);
|
||||
host_cutsq,h_form);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
host_offset);
|
||||
|
||||
dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
dev_error.zero();
|
||||
@ -197,7 +197,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
(BX/this->_threads_per_atom)));
|
||||
NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
|
||||
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE,
|
||||
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
|
||||
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
|
||||
this->time_nbor1.stop();
|
||||
|
||||
this->time_ellipsoid.start();
|
||||
@ -214,7 +214,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
// ------------ ELLIPSE_SPHERE ---------------
|
||||
this->time_nbor2.start();
|
||||
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
|
||||
ELLIPSE_SPHERE,_shared_types,_lj_types);
|
||||
ELLIPSE_SPHERE,_shared_types,_lj_types);
|
||||
this->time_nbor2.stop();
|
||||
|
||||
this->time_ellipsoid2.start();
|
||||
@ -245,7 +245,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
|
||||
this->_last_ellipse)/BX));
|
||||
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
|
||||
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
|
||||
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
|
||||
this->time_nbor3.stop();
|
||||
|
||||
this->time_ellipsoid3.start();
|
||||
@ -300,7 +300,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
|
||||
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
this->time_nbor1.start();
|
||||
this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
|
||||
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
|
||||
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
|
||||
this->time_nbor1.stop();
|
||||
this->time_ellipsoid.start();
|
||||
this->k_ellipsoid.set_size(GX,BX);
|
||||
|
||||
@ -74,7 +74,7 @@ int SoftT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_prefactor,
|
||||
host_cut,host_cutsq);
|
||||
host_cut,host_cutsq);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
@ -98,7 +98,7 @@ void SoftT::reinit(const int ntypes, double **host_cutsq,
|
||||
host_write[i]=0.0;
|
||||
|
||||
this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor,
|
||||
host_cut,host_cutsq);
|
||||
host_cut,host_cutsq);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
||||
@ -196,11 +196,12 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1
|
||||
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
|
||||
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
|
||||
int ainum=this->ans->inum();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
@ -230,18 +231,21 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
} else {
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
|
||||
&map, &elem2param, &_nelements,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
}
|
||||
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
|
||||
@ -195,7 +195,6 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
numtyp sw_powerq=sw2_ijparam.w;
|
||||
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
|
||||
numtyp sw_cut=sw3_ijparam.x;
|
||||
numtyp sw_cutsq=sw3_ijparam.y;
|
||||
numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
|
||||
pow(sw_sigma,sw_powerp);
|
||||
numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
|
||||
@ -345,7 +344,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
const int t_per_atom, const int evatom) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||
|
||||
@ -394,8 +392,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
if (rsq1 > sw3_ijparam.y) continue;
|
||||
|
||||
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||
sw_sigma=sw1_ijparam.y;
|
||||
sw_gamma=sw1_ijparam.w;
|
||||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ij=sw3_ijparam.x;
|
||||
|
||||
@ -419,15 +415,11 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
|
||||
if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
|
||||
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||
sw_sigma=sw1_ikparam.y;
|
||||
sw_gamma=sw1_ikparam.w;
|
||||
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ik=sw3_ikparam.x;
|
||||
|
||||
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
|
||||
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||
sw_epsilon=sw1_ijkparam.x;
|
||||
sw_lambda=sw1_ijkparam.z;
|
||||
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||
@ -467,14 +459,14 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||
|
||||
@ -522,18 +514,20 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
if (rsq1 > sw3_ijparam.y) continue;
|
||||
|
||||
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||
sw_sigma=sw1_ijparam.y;
|
||||
sw_gamma=sw1_ijparam.w;
|
||||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ij=sw3_ijparam.x;
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -559,15 +553,11 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (rsq2 < sw3_ikparam.y) {
|
||||
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||
sw_sigma=sw1_ikparam.y;
|
||||
sw_gamma=sw1_ikparam.w;
|
||||
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ik=sw3_ikparam.x;
|
||||
|
||||
int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik
|
||||
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||
sw_epsilon=sw1_ijkparam.x;
|
||||
sw_lambda=sw1_ijkparam.z;
|
||||
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||
@ -607,14 +597,14 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const int nelements,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
|
||||
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
|
||||
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
|
||||
|
||||
@ -662,18 +652,20 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
if (rsq1 > sw3_ijparam.y) continue;
|
||||
|
||||
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
|
||||
sw_sigma=sw1_ijparam.y;
|
||||
sw_gamma=sw1_ijparam.w;
|
||||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ij=sw3_ijparam.x;
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -699,15 +691,11 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
if (rsq2 < sw3_ikparam.y) {
|
||||
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
|
||||
sw_sigma=sw1_ikparam.y;
|
||||
sw_gamma=sw1_ikparam.w;
|
||||
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
|
||||
sw_cut_ik=sw3_ikparam.x;
|
||||
|
||||
int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; // jik
|
||||
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
|
||||
sw_epsilon=sw1_ijkparam.x;
|
||||
sw_lambda=sw1_ijkparam.z;
|
||||
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
|
||||
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
|
||||
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
|
||||
|
||||
@ -269,7 +269,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
@ -279,7 +279,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -364,7 +364,7 @@ int ** TersoffT::compute(const int ago, const int inum_full,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -437,16 +437,18 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
} else {
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -184,7 +184,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int eflag, const int nall, const int inum,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
tpa_sq = fast_mul(t_per_atom,t_per_atom);
|
||||
@ -210,7 +210,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<nall) {
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
@ -597,11 +597,12 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
|
||||
@ -666,13 +667,17 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -810,7 +815,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict ts1_in,
|
||||
const __global numtyp4 *restrict ts2_in,
|
||||
const __global numtyp4 *restrict ts4_in,
|
||||
const __global numtyp4 *restrict ts4_in,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
@ -818,11 +823,12 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
|
||||
@ -887,13 +893,17 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -964,7 +974,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp delr2[3];
|
||||
delr2[0] = kx.x-jx.x;
|
||||
delr2[1] = kx.y-jx.y;
|
||||
delr2[1] = kx.y-jx.y;
|
||||
delr2[2] = kx.z-jx.z;
|
||||
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
|
||||
|
||||
|
||||
@ -186,7 +186,7 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
|
||||
if (tmp > param_c2)
|
||||
return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
|
||||
// error in negligible 2nd term fixed 9/30/2015
|
||||
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
|
||||
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
|
||||
((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
|
||||
ucl_powr(tmp,-param_powern)));
|
||||
if (tmp < param_c4) return (numtyp)0.0;
|
||||
|
||||
@ -269,7 +269,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
@ -279,7 +279,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -364,7 +364,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -437,16 +437,18 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
} else {
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -184,7 +184,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int eflag, const int nall, const int inum,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
tpa_sq = fast_mul(t_per_atom,t_per_atom);
|
||||
@ -210,7 +210,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<nall) {
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
@ -605,11 +605,12 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
|
||||
@ -676,13 +677,17 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -826,8 +831,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict ts1_in,
|
||||
const __global numtyp4 *restrict ts2_in,
|
||||
const __global numtyp4 *restrict ts4_in,
|
||||
const __global numtyp4 *restrict ts5_in,
|
||||
const __global numtyp4 *restrict ts4_in,
|
||||
const __global numtyp4 *restrict ts5_in,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
@ -835,11 +840,12 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
|
||||
@ -906,13 +912,17 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -983,7 +993,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp delr2[3];
|
||||
delr2[0] = kx.x-jx.x;
|
||||
delr2[1] = kx.y-jx.y;
|
||||
delr2[1] = kx.y-jx.y;
|
||||
delr2[2] = kx.z-jx.z;
|
||||
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
|
||||
|
||||
|
||||
@ -180,12 +180,12 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
|
||||
{
|
||||
numtyp tmp = param_beta * zeta;
|
||||
if (tmp > param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) *
|
||||
ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
|
||||
ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
|
||||
if (tmp < param_ca4) return (numtyp)0.0;
|
||||
|
||||
numtyp tmp_n = ucl_powr(tmp,param_powern);
|
||||
return (numtyp)-0.5 *(param_powern/param_powern_del) *
|
||||
ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
|
||||
ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
|
||||
((numtyp)2.0*param_powern_del)))*tmp_n / zeta;
|
||||
}
|
||||
|
||||
|
||||
@ -294,7 +294,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
else
|
||||
_eflag=0;
|
||||
|
||||
int ainum=nall;
|
||||
int ainum=nlist;
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
int BX=this->block_pair();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
|
||||
@ -304,7 +304,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -389,7 +389,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full,
|
||||
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
int evatom=0;
|
||||
if (eatom || vatom)
|
||||
@ -463,16 +463,18 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
|
||||
} else {
|
||||
this->k_three_end.set_size(GX,BX);
|
||||
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
|
||||
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->nbor->dev_acc,
|
||||
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
|
||||
}
|
||||
|
||||
this->time_pair.stop();
|
||||
|
||||
@ -188,7 +188,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
||||
__global acctyp4 * zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int eflag, const int nall, const int inum,
|
||||
const int eflag, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom) {
|
||||
__local int tpa_sq,n_stride;
|
||||
tpa_sq = fast_mul(t_per_atom,t_per_atom);
|
||||
@ -216,7 +216,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (ii<nall) {
|
||||
if (ii<inum) {
|
||||
int nbor_j, nbor_end;
|
||||
int i, numj;
|
||||
|
||||
@ -617,11 +617,12 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
|
||||
@ -686,13 +687,17 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -830,7 +835,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
__kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict ts1_in,
|
||||
const __global numtyp4 *restrict ts2_in,
|
||||
const __global numtyp4 *restrict ts4_in,
|
||||
const __global numtyp4 *restrict ts4_in,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
@ -838,11 +843,12 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
const __global acctyp4 *restrict zetaij,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const __global int * dev_acc,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
const int eflag, const int vflag,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
const int t_per_atom, const int gpu_nbor) {
|
||||
__local int tpa_sq, n_stride;
|
||||
tpa_sq=fast_mul(t_per_atom,t_per_atom);
|
||||
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
|
||||
@ -907,13 +913,17 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
mdelr1[1] = -delr1[1];
|
||||
mdelr1[2] = -delr1[2];
|
||||
|
||||
int nbor_k=j+nbor_pitch;
|
||||
int numk=dev_nbor[nbor_k];
|
||||
int nbor_k,numk;
|
||||
if (dev_nbor==dev_packed) {
|
||||
if (gpu_nbor) nbor_k=j+nbor_pitch;
|
||||
else nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
|
||||
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
|
||||
nbor_k+=offset_k;
|
||||
} else {
|
||||
nbor_k=dev_acc[j]+nbor_pitch;
|
||||
numk=dev_nbor[nbor_k];
|
||||
nbor_k+=nbor_pitch;
|
||||
nbor_k=dev_nbor[nbor_k];
|
||||
k_end=nbor_k+numk;
|
||||
@ -984,7 +994,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp delr2[3];
|
||||
delr2[0] = kx.x-jx.x;
|
||||
delr2[1] = kx.y-jx.y;
|
||||
delr2[1] = kx.y-jx.y;
|
||||
delr2[2] = kx.z-jx.z;
|
||||
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
|
||||
|
||||
|
||||
@ -212,7 +212,7 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
|
||||
if (tmp > param_c2)
|
||||
return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
|
||||
// error in negligible 2nd term fixed 9/30/2015
|
||||
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
|
||||
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
|
||||
((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
|
||||
ucl_powr(tmp,-param_powern)));
|
||||
if (tmp < param_c4) return (numtyp)0.0;
|
||||
|
||||
@ -75,7 +75,7 @@ int YukawaT::init(const int ntypes,
|
||||
|
||||
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,host_offset,
|
||||
host_cutsq);
|
||||
host_cutsq);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
@ -96,7 +96,7 @@ int YukawaColloidT::init(const int ntypes,
|
||||
|
||||
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,
|
||||
host_offset,host_cutsq);
|
||||
host_offset,host_cutsq);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
@ -89,10 +89,10 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
|
||||
if (rsq<coeff[mtype].z) {
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
numtyp rinv = ucl_recip(r);
|
||||
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
|
||||
numtyp force = coeff[mtype].x * screening;
|
||||
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
|
||||
numtyp force = coeff[mtype].x * screening;
|
||||
|
||||
force = factor_lj*force * rinv;
|
||||
force = factor_lj*force * rinv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
@ -181,10 +181,10 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
|
||||
if (rsq<coeff[mtype].z) {
|
||||
numtyp r = ucl_sqrt(rsq);
|
||||
numtyp rinv = ucl_recip(r);
|
||||
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
|
||||
numtyp force = coeff[mtype].x * screening;
|
||||
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
|
||||
numtyp force = coeff[mtype].x * screening;
|
||||
|
||||
force = factor_lj*force * rinv;
|
||||
force = factor_lj*force * rinv;
|
||||
|
||||
f.x+=delx*force;
|
||||
f.y+=dely*force;
|
||||
|
||||
@ -79,11 +79,11 @@ int ZBLT::init(const int ntypes, double **host_cutsq,
|
||||
|
||||
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_sw1,host_sw2,
|
||||
host_zze, host_cutsq);
|
||||
host_zze, host_cutsq);
|
||||
|
||||
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_d1a,host_d2a,
|
||||
host_d3a,host_d4a);
|
||||
host_d3a,host_d4a);
|
||||
|
||||
coeff3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,coeff3,host_write,host_sw3,host_sw4,host_sw5);
|
||||
|
||||
@ -134,10 +134,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
|
||||
force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
|
||||
coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
|
||||
|
||||
if (rsq>cut_innersq) {
|
||||
t = r - cut_inner;
|
||||
force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
|
||||
}
|
||||
if (rsq>cut_innersq) {
|
||||
t = r - cut_inner;
|
||||
force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
|
||||
}
|
||||
|
||||
force *= (numtyp)-1.0*ucl_recip(r);
|
||||
|
||||
@ -148,10 +148,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
|
||||
if (eflag>0) {
|
||||
numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
|
||||
coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
|
||||
e += coeff3[mtype].z;
|
||||
if (rsq > cut_innersq) {
|
||||
e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
|
||||
}
|
||||
e += coeff3[mtype].z;
|
||||
if (rsq > cut_innersq) {
|
||||
e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
|
||||
}
|
||||
|
||||
energy+=e;
|
||||
}
|
||||
@ -237,10 +237,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
|
||||
force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
|
||||
coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
|
||||
|
||||
if (rsq>cut_innersq) {
|
||||
t = r - cut_inner;
|
||||
force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
|
||||
}
|
||||
if (rsq>cut_innersq) {
|
||||
t = r - cut_inner;
|
||||
force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
|
||||
}
|
||||
|
||||
force *= (numtyp)-1.0*ucl_recip(r);
|
||||
|
||||
@ -251,10 +251,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
|
||||
if (eflag>0) {
|
||||
numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
|
||||
coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
|
||||
e += coeff3[mtype].z;
|
||||
if (rsq > cut_innersq) {
|
||||
e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
|
||||
}
|
||||
e += coeff3[mtype].z;
|
||||
if (rsq > cut_innersq) {
|
||||
e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
|
||||
}
|
||||
|
||||
energy+=e;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user