git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15248 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp
2016-07-01 23:27:26 +00:00
parent 8366b35459
commit 9656958169
245 changed files with 4890 additions and 4832 deletions

View File

@ -1,6 +1,6 @@
# Settings that the LAMMPS build will import when this package library is used
# settings for OpenCL builds
gpu_SYSINC =
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -lOpenCL
gpu_SYSLIB = -Wl,--enable-stdcall-fixup -L../../tools/mingw-cross$(LIBOBJDIR) -Wl,-Bdynamic,-lOpenCL,-Bstatic
gpu_SYSPATH =

View File

@ -7,7 +7,7 @@
EXTRAMAKE = Makefile.lammps.standard
ifeq($(CUDA_HOME),)
ifeq ($(CUDA_HOME),)
CUDA_HOME = /usr/local/cuda
endif

View File

@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-I$(CUDA_HOME)/include
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL -L../../src/STUBS -lmpi_mingw32
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL
EXTRAMAKE = Makefile.lammps.mingw-cross

View File

@ -4,7 +4,7 @@ OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
-I../../tools/mingw-cross/mpich2-win32/include/ \
-DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -lOpenCL \
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -3,7 +3,7 @@ CUDA_HOME = ../../tools/mingw-cross/OpenCL
OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-I$(CUDA_HOME)/include
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../src/STUBS -lmpi_mingw64
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -5,7 +5,7 @@ OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-I../../tools/mingw-cross/mpich2-win64/include/ \
-DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -lOpenCL \
OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
OCL_PREC = -D_SINGLE_DOUBLE
OCL_TUNE = -DFERMI_OCL

View File

@ -365,7 +365,7 @@ void UCL_Device::print_all(std::ostream &out) {
cuDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
<< std::endl;
#endif
if (num_devices() == 0)

View File

@ -62,6 +62,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
gpu_nbor=1;
else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
_gpu_nbor=gpu_nbor;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
@ -168,7 +169,12 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
if (!success)
return NULL;
nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
// Originally nall == nlist was required so that neighbors of neighbors
// could be indexed directly after re-arrangement:
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
// That requirement has been removed so this also works within pair hybrid.
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
@ -185,11 +191,11 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
int **nspecial, tagint **special,
bool &success) {
success=true;
resize_atom(inum,nall,success);
resize_local(nall,host_inum,nbor->max_nbors(),success);
@ -214,7 +220,7 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
@ -230,7 +236,7 @@ void BaseThreeT::compute(const int f_ago, const int nlocal, const int nall,
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(ago,nlocal,cpu_time);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
#ifdef THREE_CONCURRENT
ans2->inum(inum);

View File

@ -205,6 +205,7 @@ class BaseThree {
protected:
bool _compiled;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

View File

@ -80,7 +80,7 @@ int BornT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,
@ -113,7 +113,7 @@ void BornT::reinit(const int ntypes, double **host_rhoinv,
this->atom->type_pack4(ntypes,_lj_types,coeff1,host_write,host_rhoinv,
host_born1,host_born2,host_born3);
this->atom->type_pack4(ntypes,_lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
host_d,host_offset);
}
template <class numtyp, class acctyp>

View File

@ -84,7 +84,7 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,

View File

@ -84,7 +84,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_d,host_offset);
host_d,host_offset);
cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq_sigma,host_write,host_cutsq,

View File

@ -79,7 +79,7 @@ int BuckT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset);
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -81,7 +81,7 @@ int BuckCoulT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset);
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,cutsq,host_write,host_cutsq,

View File

@ -83,7 +83,7 @@ int BuckCoulLongT::init(const int ntypes, double **host_cutsq,
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_a,host_c,
host_offset);
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);

View File

@ -80,7 +80,7 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -117,9 +117,9 @@ int** crml_gpu_compute_n(const int ago, const int inum_full,
}
void crml_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd) {

View File

@ -51,7 +51,7 @@ int CoulLongT::init(const int ntypes, double **host_scale,
const double qqrd2e, const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,coul_long,"k_coul_long");
gpu_split,_screen,coul_long,"k_coul_long");
if (success!=0)
return success;

View File

@ -40,9 +40,9 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {
int init(const int ntypes, double **scale,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
const double gpu_split, FILE *screen,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Send updated coeffs from host to device (to be compatible with fix adapt)
void reinit(const int ntypes, double **scale);

View File

@ -114,28 +114,28 @@ void cl_gpu_clear() {
}
// Free-function entry point that forwards every argument, unchanged and in
// the same order, to CLMF.compute(...) and returns that call's int** result.
// CLMF is an object defined outside this view.
// NOTE(review): this overload takes sublo/subhi/tag/nspecial/special and
// passes ilist/jnum as int** (output-style) arguments -- presumably the
// variant in which the neighbor list is built by the library rather than
// supplied by the caller; confirm against the CLMF.compute overloads.
int** cl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
// Free-function entry point that forwards every argument, unchanged and in
// the same order, to CLMF.compute(...); nothing is returned. Status and the
// host start index are reported only through the by-reference parameters
// `success` and `host_start`, which are passed straight through.
// CLMF is an object defined outside this view.
// NOTE(review): here ilist/numj/firstneigh are plain input pointers, so this
// is presumably the variant that consumes a caller-supplied neighbor list;
// confirm against the CLMF.compute overloads.
void cl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q,nlocal,boxlo,prd);
}
double cl_gpu_bytes() {

View File

@ -650,7 +650,7 @@ int DeviceT::compile_kernels() {
int flag=0;
if (_compiled)
return flag;
return flag;
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(device,compile_string().c_str());

View File

@ -238,7 +238,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_,
if (mui.w > (numtyp)0.0 && muj.w > (numtyp)0.0) {
r3inv = r2inv*rinv;
r5inv = r3inv*r2inv;
r7inv = r5inv*r2inv;
r7inv = r5inv*r2inv;
pdotp = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;

View File

@ -76,7 +76,7 @@ int DPDT::init(const int ntypes,
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma,
host_sigma,host_cut);
host_sigma,host_cut);
UCL_H_Vec<numtyp> host_rsq(lj_types*lj_types,*(this->ucl_device),
UCL_WRITE_ONLY);
@ -164,7 +164,7 @@ void DPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma,
UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
UCL_WRITE_ONLY);
this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma,
host_sigma,host_cut);
host_sigma,host_cut);
}
template class DPD<PRECISION,ACC_PRECISION>;

View File

@ -75,7 +75,7 @@ int GaussT::init(const int ntypes,
gauss1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,gauss1,host_write,host_a,host_b,
host_cutsq,host_offset);
host_cutsq,host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
@ -99,7 +99,7 @@ void GaussT::reinit(const int ntypes, double **host_cutsq, double **host_a,
host_write[i]=0.0;
this->atom->type_pack4(ntypes,_lj_types,gauss1,host_write,host_a,host_b,
host_cutsq,host_offset);
host_cutsq,host_offset);
}
template <class numtyp, class acctyp>

View File

@ -84,19 +84,19 @@ int GayBerneT::init(const int ntypes, const double gamma,
sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
host_sigma,host_epsilon);
this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
host_cutsq,h_form);
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
dev_error.zero();
@ -209,7 +209,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
@ -242,7 +242,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor2.stop();
this->time_ellipsoid2.start();
@ -300,7 +300,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) {
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
this->time_nbor1.start();
this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);

View File

@ -26,58 +26,58 @@ ucl_inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape
den = ucl_recip(den);
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void k_gayberne(const __global numtyp4 *restrict x_,

View File

@ -76,11 +76,11 @@ int LJT::init(const int ntypes,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -76,11 +76,11 @@ int LJ96T::init(const int ntypes,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -80,11 +80,11 @@ int LJClass2LongT::init(const int ntypes, double **host_cutsq,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {

View File

@ -79,11 +79,11 @@ int LJCoulT::init(const int ntypes,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);

View File

@ -80,11 +80,11 @@ int LJCoulDebyeT::init(const int ntypes,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cut_coulsq);
host_cut_ljsq, host_cut_coulsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);

View File

@ -80,11 +80,11 @@ int LJCoulLongT::init(const int ntypes,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {

View File

@ -81,11 +81,11 @@ int LJCoulMSMT::init(const int ntypes,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_cut_ljsq);
host_cutsq, host_cut_ljsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
// pack gcons and dgcons
int nrows, ncols;

View File

@ -77,11 +77,11 @@ int LJCubicT::init(const int ntypes,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
host_cutsq);
lj2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj2,host_write,host_cut_inner_sq,
host_cut_inner,host_sigma,host_epsilon);
host_cut_inner,host_sigma,host_epsilon);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4);

View File

@ -84,11 +84,11 @@ int LJDSFT::init(const int ntypes, double **host_cutsq, double **host_lj1,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cut_ljsq, host_cutsq);
host_cut_ljsq, host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {

View File

@ -76,11 +76,11 @@ int LJExpandT::init(const int ntypes, double **host_cutsq,
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq, host_shift);
host_cutsq, host_shift);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -82,9 +82,9 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_,
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
numtyp r = ucl_sqrt(r2inv);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = ucl_recip(rshiftsq);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = ucl_recip(rshiftsq);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj/rshift/r;
@ -175,9 +175,9 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_,
if (r2inv<lj1[mtype].z) {
numtyp r = ucl_sqrt(r2inv);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = ucl_recip(rshiftsq);
numtyp rshift = r - lj1[mtype].w;
numtyp rshiftsq = rshift*rshift;
r2inv = ucl_recip(rshiftsq);
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj/rshift/r;

View File

@ -76,11 +76,11 @@ int MieT::init(const int ntypes, double **host_cutsq,
mie1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,mie1,host_write,host_mie1,host_mie2,
host_gamA,host_gamR);
host_gamA,host_gamR);
mie3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,mie3,host_write,host_mie3,host_mie4,
host_offset,host_cutsq);
host_offset,host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -127,7 +127,10 @@ void Neighbor::alloc(bool &success) {
dev_packed.clear();
success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
_packed_permissions)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes();
dev_acc.clear();
success=success && (dev_acc.alloc(_max_atoms,*dev,
UCL_READ_WRITE)==UCL_SUCCESS);
_c_bytes+=dev_packed.row_bytes()+dev_acc.row_bytes();
}
if (_max_host>0) {
nbor_host.clear();
@ -194,6 +197,7 @@ void Neighbor::clear() {
host_packed.clear();
host_acc.clear();
dev_acc.clear();
dev_nbor.clear();
nbor_host.clear();
dev_packed.clear();
@ -225,7 +229,7 @@ double Neighbor::host_memory_usage() const {
}
void Neighbor::get_host(const int inum, int *ilist, int *numj,
int **firstneigh, const int block_size) {
int **firstneigh, const int block_size) {
_nbor_time_avail=true;
time_nbor.start();
@ -278,6 +282,15 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
UCL_D_Vec<int> acc_view;
acc_view.view_offset(inum,dev_nbor,inum*2);
ucl_copy(acc_view,host_acc,true);
UCL_H_Vec<int> host_view;
host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
for (int ii=0; ii<inum; ii++) {
int i=ilist[ii];
host_view[i] = ii;
}
ucl_copy(dev_acc,host_view,true);
time_nbor.stop();
if (_use_packing==false) {

View File

@ -199,6 +199,8 @@ class Neighbor {
UCL_H_Vec<int> host_packed;
/// Host storage for nbor counts (row 1) & accumulated neighbor counts (row 2)
UCL_H_Vec<int> host_acc;
/// Device storage for accessing atom indices from the neighbor list (3-body)
UCL_D_Vec<int> dev_acc;
// ----------------- Data for GPU Neighbor Calculation ---------------

View File

@ -118,24 +118,24 @@ __kernel void transpose(__global tagint *restrict out,
const __global tagint *restrict in,
int columns_in, int rows_in)
{
__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
__syncthreads();
__syncthreads();
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
}
__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
@ -191,7 +191,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
nbor_list[pid_i]=pid_i;
} else {
stride=0;
neigh_counts=host_numj+pid_i-inum;
neigh_counts=host_numj+pid_i-inum;
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
}
@ -243,8 +243,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
}
}
}
__syncthreads();
} // for (k)
__syncthreads();
} // for (k)
}
}
}

View File

@ -51,7 +51,7 @@ void NeighborShared::clear() {
void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor,
const std::string flags) {
if (_compiled)
return;
return;
_gpu_nbor=gpu_nbor;
if (_gpu_nbor==0) {

View File

@ -270,19 +270,19 @@ __kernel void interp(const __global numtyp4 *restrict x_,
int my=mz+fast_mul(ny,npts_x);
for (int m=0; m<order; m++) {
grdtyp y0=z0*rho1d_1[m][tid];
for (int l=0; l<order; l++) {
grdtyp x0=y0*rho1d_0[l][tid];
grdtyp4 el=brick[my+l];
ek.x-=x0*el.x;
ek.y-=x0*el.y;
ek.z-=x0*el.z;
}
for (int l=0; l<order; l++) {
grdtyp x0=y0*rho1d_0[l][tid];
grdtyp4 el=brick[my+l];
ek.x-=x0*el.x;
ek.y-=x0*el.y;
ek.z-=x0*el.z;
}
my+=npts_x;
}
mz+=npts_yx;
}
}
}
ans[ii]=ek;
}
}
}

View File

@ -115,6 +115,14 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#define OCL_DEFAULT_VENDOR "generic"
#endif
// Choose the OCL_DEFAULT_VENDOR string from the *_OCL macro that is defined
// (these are presumably set on the compile line, e.g. via OCL_TUNE -- confirm).
// NOTE(review): each #define below is unguarded, so if more than one *_OCL
// macro is defined at once, OCL_DEFAULT_VENDOR is redefined with a different
// value; verify that the build only ever sets one of them.
#ifdef INTEL_OCL
#define OCL_DEFAULT_VENDOR "intel"
#endif
#ifdef PHI_OCL
#define OCL_DEFAULT_VENDOR "phi"
#endif
// Fallback: no vendor macro selected a value above.
#ifndef OCL_DEFAULT_VENDOR
#define OCL_DEFAULT_VENDOR "none"
#endif

View File

@ -81,19 +81,19 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well,
sigma_epsilon.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
host_sigma,host_epsilon);
this->cut_form.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,this->cut_form,host_write,
host_cutsq,h_form);
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
host_offset);
dev_error.alloc(1,*(this->ucl_device),UCL_WRITE_ONLY);
dev_error.zero();
@ -197,7 +197,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
(BX/this->_threads_per_atom)));
NGX=static_cast<int>(ceil(static_cast<double>(this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_ELLIPSE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
@ -214,7 +214,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
// ------------ ELLIPSE_SPHERE ---------------
this->time_nbor2.start();
this->pack_nbors(NGX,BX, 0, this->_last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_SPHERE,_shared_types,_lj_types);
ELLIPSE_SPHERE,_shared_types,_lj_types);
this->time_nbor2.stop();
this->time_ellipsoid2.start();
@ -245,7 +245,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum()-
this->_last_ellipse)/BX));
this->pack_nbors(NGX,BX,this->_last_ellipse,this->ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
SPHERE_ELLIPSE,SPHERE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor3.stop();
this->time_ellipsoid3.start();
@ -300,7 +300,7 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
NGX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
this->time_nbor1.start();
this->pack_nbors(NGX, BX, 0, this->ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
ELLIPSE_ELLIPSE,_shared_types,_lj_types);
this->time_nbor1.stop();
this->time_ellipsoid.start();
this->k_ellipsoid.set_size(GX,BX);

View File

@ -74,7 +74,7 @@ int SoftT::init(const int ntypes, double **host_cutsq,
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_prefactor,
host_cut,host_cutsq);
host_cut,host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
@ -98,7 +98,7 @@ void SoftT::reinit(const int ntypes, double **host_cutsq,
host_write[i]=0.0;
this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_prefactor,
host_cut,host_cutsq);
host_cut,host_cutsq);
}
template <class numtyp, class acctyp>

View File

@ -196,11 +196,12 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
@ -230,18 +231,21 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
this->time_pair.stop();
}

View File

@ -195,7 +195,6 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
numtyp sw_powerq=sw2_ijparam.w;
numtyp4 sw3_ijparam; fetch4(sw3_ijparam,ijparam,sw3_tex);
numtyp sw_cut=sw3_ijparam.x;
numtyp sw_cutsq=sw3_ijparam.y;
numtyp pre_sw_c1=sw_biga*sw_epsilon*sw_powerp*sw_bigb*
pow(sw_sigma,sw_powerp);
numtyp pre_sw_c2=sw_biga*sw_epsilon*sw_powerq*
@ -345,7 +344,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const int t_per_atom, const int evatom) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -394,8 +392,6 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
@ -419,15 +415,11 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
numtyp rsq2 = delr2x*delr2x + delr2y*delr2y + delr2z*delr2z;
if (rsq2 < sw3_ikparam.y) { // sw_cutsq=sw3[ikparam].y;
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[itype*nelements*nelements+jtype*nelements+ktype];
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -467,14 +459,14 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -522,18 +514,20 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -559,15 +553,11 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);
@ -607,14 +597,14 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp sw_epsilon, sw_sigma, sw_lambda, sw_gamma;
numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik;
numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk;
@ -662,18 +652,20 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (rsq1 > sw3_ijparam.y) continue;
numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex);
sw_sigma=sw1_ijparam.y;
sw_gamma=sw1_ijparam.w;
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -699,15 +691,11 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (rsq2 < sw3_ikparam.y) {
numtyp4 sw1_ikparam; fetch4(sw1_ikparam,ikparam,sw1_tex);
sw_sigma=sw1_ikparam.y;
sw_gamma=sw1_ikparam.w;
sw_sigma_gamma_ik=sw1_ikparam.y*sw1_ikparam.w; //sw_sigma*sw_gamma;
sw_cut_ik=sw3_ikparam.x;
int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; // jik
numtyp4 sw1_ijkparam; fetch4(sw1_ijkparam,ijkparam,sw1_tex);
sw_epsilon=sw1_ijkparam.x;
sw_lambda=sw1_ijkparam.z;
sw_lambda_epsilon_ijk=sw1_ijkparam.x*sw1_ijkparam.z; //sw_lambda*sw_epsilon;
sw_lambda_epsilon2_ijk=(numtyp)2.0*sw_lambda_epsilon_ijk;
numtyp4 sw3_ijkparam; fetch4(sw3_ijkparam,ijkparam,sw3_tex);

View File

@ -269,7 +269,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
else
_eflag=0;
int ainum=nall;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -279,7 +279,7 @@ void TersoffT::compute(const int f_ago, const int nlocal, const int nall,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -364,7 +364,7 @@ int ** TersoffT::compute(const int ago, const int inum_full,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -437,16 +437,18 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
this->time_pair.stop();

View File

@ -184,7 +184,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const int eflag, const int nall, const int inum,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -210,7 +210,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<nall) {
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
@ -597,11 +597,12 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -666,13 +667,17 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -810,7 +815,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
__kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict ts1_in,
const __global numtyp4 *restrict ts2_in,
const __global numtyp4 *restrict ts4_in,
const __global numtyp4 *restrict ts4_in,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
@ -818,11 +823,12 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -887,13 +893,17 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -964,7 +974,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp delr2[3];
delr2[0] = kx.x-jx.x;
delr2[1] = kx.y-jx.y;
delr2[1] = kx.y-jx.y;
delr2[2] = kx.z-jx.z;
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];

View File

@ -186,7 +186,7 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
if (tmp > param_c2)
return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
// error in negligible 2nd term fixed 9/30/2015
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
ucl_powr(tmp,-param_powern)));
if (tmp < param_c4) return (numtyp)0.0;

View File

@ -269,7 +269,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
else
_eflag=0;
int ainum=nall;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -279,7 +279,7 @@ void TersoffMT::compute(const int f_ago, const int nlocal, const int nall,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -364,7 +364,7 @@ int ** TersoffMT::compute(const int ago, const int inum_full,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -437,16 +437,18 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
this->time_pair.stop();

View File

@ -184,7 +184,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const int eflag, const int nall, const int inum,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -210,7 +210,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<nall) {
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
@ -605,11 +605,12 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
@ -676,13 +677,17 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -826,8 +831,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
__kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict ts1_in,
const __global numtyp4 *restrict ts2_in,
const __global numtyp4 *restrict ts4_in,
const __global numtyp4 *restrict ts5_in,
const __global numtyp4 *restrict ts4_in,
const __global numtyp4 *restrict ts5_in,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
@ -835,11 +840,12 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h;
@ -906,13 +912,17 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -983,7 +993,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp delr2[3];
delr2[0] = kx.x-jx.x;
delr2[1] = kx.y-jx.y;
delr2[1] = kx.y-jx.y;
delr2[2] = kx.z-jx.z;
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];

View File

@ -180,12 +180,12 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
{
numtyp tmp = param_beta * zeta;
if (tmp > param_ca1) return (numtyp)-0.5*(param_powern/param_powern_del) *
ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
ucl_powr(tmp,(numtyp)-0.5*(param_powern/param_powern_del)) / zeta;
if (tmp < param_ca4) return (numtyp)0.0;
numtyp tmp_n = ucl_powr(tmp,param_powern);
return (numtyp)-0.5 *(param_powern/param_powern_del) *
ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
ucl_powr((numtyp)1.0+tmp_n, (numtyp)-1.0-((numtyp)1.0 /
((numtyp)2.0*param_powern_del)))*tmp_n / zeta;
}

View File

@ -294,7 +294,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
else
_eflag=0;
int ainum=nall;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
@ -304,7 +304,7 @@ void TersoffZT::compute(const int f_ago, const int nlocal, const int nall,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -389,7 +389,7 @@ int ** TersoffZT::compute(const int ago, const int inum_full,
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &nall, &ainum, &nbor_pitch, &this->_threads_per_atom);
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
@ -463,16 +463,18 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom);
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
this->time_pair.stop();

View File

@ -188,7 +188,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const int eflag, const int nall, const int inum,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
tpa_sq = fast_mul(t_per_atom,t_per_atom);
@ -216,7 +216,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<nall) {
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
@ -617,11 +617,12 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -686,13 +687,17 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -830,7 +835,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
__kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict ts1_in,
const __global numtyp4 *restrict ts2_in,
const __global numtyp4 *restrict ts4_in,
const __global numtyp4 *restrict ts4_in,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
@ -838,11 +843,12 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const int t_per_atom) {
const int t_per_atom, const int gpu_nbor) {
__local int tpa_sq, n_stride;
tpa_sq=fast_mul(t_per_atom,t_per_atom);
numtyp lam3, powermint, bigr, bigd, c, d, h, gamma;
@ -907,13 +913,17 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
mdelr1[1] = -delr1[1];
mdelr1[2] = -delr1[2];
int nbor_k=j+nbor_pitch;
int numk=dev_nbor[nbor_k];
int nbor_k,numk;
if (dev_nbor==dev_packed) {
if (gpu_nbor) nbor_k=j+nbor_pitch;
else nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1);
k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1));
nbor_k+=offset_k;
} else {
nbor_k=dev_acc[j]+nbor_pitch;
numk=dev_nbor[nbor_k];
nbor_k+=nbor_pitch;
nbor_k=dev_nbor[nbor_k];
k_end=nbor_k+numk;
@ -984,7 +994,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp delr2[3];
delr2[0] = kx.x-jx.x;
delr2[1] = kx.y-jx.y;
delr2[1] = kx.y-jx.y;
delr2[2] = kx.z-jx.z;
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];

View File

@ -212,7 +212,7 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta,
if (tmp > param_c2)
return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) *
// error in negligible 2nd term fixed 9/30/2015
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
// (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) *
((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) *
ucl_powr(tmp,-param_powern)));
if (tmp < param_c4) return (numtyp)0.0;

View File

@ -75,7 +75,7 @@ int YukawaT::init(const int ntypes,
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,host_offset,
host_cutsq);
host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -96,7 +96,7 @@ int YukawaColloidT::init(const int ntypes,
coeff.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a,
host_offset,host_cutsq);
host_offset,host_cutsq);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);

View File

@ -89,10 +89,10 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_,
if (rsq<coeff[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp rinv = ucl_recip(r);
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
numtyp force = coeff[mtype].x * screening;
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
numtyp force = coeff[mtype].x * screening;
force = factor_lj*force * rinv;
force = factor_lj*force * rinv;
f.x+=delx*force;
f.y+=dely*force;
@ -181,10 +181,10 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_,
if (rsq<coeff[mtype].z) {
numtyp r = ucl_sqrt(rsq);
numtyp rinv = ucl_recip(r);
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
numtyp force = coeff[mtype].x * screening;
numtyp screening = ucl_exp(-kappa*(r-(radi+radj)));
numtyp force = coeff[mtype].x * screening;
force = factor_lj*force * rinv;
force = factor_lj*force * rinv;
f.x+=delx*force;
f.y+=dely*force;

View File

@ -79,11 +79,11 @@ int ZBLT::init(const int ntypes, double **host_cutsq,
coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff1,host_write,host_sw1,host_sw2,
host_zze, host_cutsq);
host_zze, host_cutsq);
coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_d1a,host_d2a,
host_d3a,host_d4a);
host_d3a,host_d4a);
coeff3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,coeff3,host_write,host_sw3,host_sw4,host_sw5);

View File

@ -134,10 +134,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
if (rsq>cut_innersq) {
t = r - cut_inner;
force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
}
if (rsq>cut_innersq) {
t = r - cut_inner;
force = t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
}
force *= (numtyp)-1.0*ucl_recip(r);
@ -148,10 +148,10 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_,
if (eflag>0) {
numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
e += coeff3[mtype].z;
if (rsq > cut_innersq) {
e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
}
e += coeff3[mtype].z;
if (rsq > cut_innersq) {
e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
}
energy+=e;
}
@ -237,10 +237,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
force = dzbldr(r, coeff2[mtype].x, coeff2[mtype].y,
coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
if (rsq>cut_innersq) {
t = r - cut_inner;
force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
}
if (rsq>cut_innersq) {
t = r - cut_inner;
force += t*t * (coeff1[mtype].x + coeff1[mtype].y*t);
}
force *= (numtyp)-1.0*ucl_recip(r);
@ -251,10 +251,10 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_,
if (eflag>0) {
numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y,
coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z);
e += coeff3[mtype].z;
if (rsq > cut_innersq) {
e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
}
e += coeff3[mtype].z;
if (rsq > cut_innersq) {
e += t*t*t * (coeff3[mtype].x + coeff3[mtype].y*t);
}
energy+=e;
}