From 209999068ced41194ffdb3b1b1ee7837eb3478fc Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 21 Sep 2012 15:57:23 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8810 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Nvidia.makefile | 2 +- lib/gpu/Opencl.makefile | 31 ++-- lib/gpu/lal_atom.cu | 6 +- lib/gpu/lal_aux_fun1.h | 71 +++++++++ lib/gpu/lal_base_atomic.cpp | 2 +- lib/gpu/lal_base_charge.cpp | 2 +- lib/gpu/lal_base_dipole.cpp | 2 +- lib/gpu/lal_base_ellipsoid.cpp | 7 +- lib/gpu/lal_born.cu | 43 +++--- lib/gpu/lal_born_coul_long.cu | 43 +++--- lib/gpu/lal_born_coul_wolf.cu | 93 +++++++----- lib/gpu/lal_buck.cu | 39 +++-- lib/gpu/lal_buck_coul.cu | 47 +++--- lib/gpu/lal_buck_coul_long.cu | 54 ++++--- lib/gpu/lal_cg_cmm.cu | 39 +++-- lib/gpu/lal_cg_cmm_long.cu | 49 +++--- lib/gpu/lal_charmm_long.cu | 58 ++++--- lib/gpu/lal_colloid.cu | 76 ++++++---- lib/gpu/lal_coul_dsf.cu | 52 +++++-- lib/gpu/lal_coul_long.cu | 247 +++++++++++++++--------------- lib/gpu/lal_device.cpp | 5 +- lib/gpu/lal_device.cu | 4 +- lib/gpu/lal_device.h | 4 +- lib/gpu/lal_dipole_lj.cu | 120 ++++++++++----- lib/gpu/lal_dipole_lj_sf.cu | 122 ++++++++++----- lib/gpu/lal_eam.cpp | 16 +- lib/gpu/lal_eam.cu | 186 ++++++++++++++++------- lib/gpu/lal_eam.h | 10 +- lib/gpu/lal_eam_ext.cpp | 14 +- lib/gpu/lal_ellipsoid_extra.h | 131 +++++++++------- lib/gpu/lal_ellipsoid_nbor.cu | 30 ++-- lib/gpu/lal_gauss.cu | 28 ++-- lib/gpu/lal_gayberne.cu | 23 ++- lib/gpu/lal_gayberne_lj.cu | 65 +++++--- lib/gpu/lal_lj.cu | 39 +++-- lib/gpu/lal_lj96.cu | 39 +++-- lib/gpu/lal_lj_class2_long.cu | 51 ++++--- lib/gpu/lal_lj_coul.cu | 49 +++--- lib/gpu/lal_lj_coul_debye.cu | 53 ++++--- lib/gpu/lal_lj_coul_long.cu | 49 +++--- lib/gpu/lal_lj_dsf.cu | 58 ++++--- lib/gpu/lal_lj_expand.cu | 33 ++-- lib/gpu/lal_morse.cu | 39 +++-- lib/gpu/lal_neighbor_cpu.cu | 7 +- lib/gpu/lal_neighbor_gpu.cu | 26 ++-- lib/gpu/lal_pppm.cu | 27 ++-- lib/gpu/lal_preprocessor.h | 22 ++- lib/gpu/lal_re_squared.cu | 24 +-- lib/gpu/lal_re_squared_lj.cu | 270 +++++++++++++++++++++------------ lib/gpu/lal_table.cu | 230 +++++++++++++++------------- lib/gpu/lal_yukawa.cu | 37 +++-- lib/gpu/lal_yukawa_colloid.cpp | 2 +- lib/gpu/lal_yukawa_colloid.cu | 38 +++-- 53 files changed, 1733 insertions(+), 1081 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 31c687369a..495ff7d054 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -87,7 +87,7 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \ $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \ $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \ - $(OBJ_DIR)/buck_coul_wolf.cubin $(OBJ_DIR)/buck_coul_wolf_cubin.h \ + $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul_cubin.h \ $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \ $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \ $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \ diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 51bd78fbd7..4ace9bd3c1 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -64,16 +64,11 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \ $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \ $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \ - $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \ - $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \ - $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \ - $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \ - $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \ - $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \ - $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \ - $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \ - $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \ - $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h + $(OBJ_DIR)/born_cl.h $(OBJ_DIR)/born_coul_wolf_cl.h \ + $(OBJ_DIR)/born_coul_long_cl.h $(OBJ_DIR)/dipole_lj_cl.h \ + $(OBJ_DIR)/dipole_lj_sf_cl.h $(OBJ_DIR)/colloid_cl.h \ + $(OBJ_DIR)/gauss_cl.h $(OBJ_DIR)/yukawa_colloid_cl.h \ + $(OBJ_DIR)/lj_coul_debye_cl.h $(OBJ_DIR)/coul_dsf_cl.h OCL_EXECS = $(BIN_DIR)/ocl_get_devices @@ -131,11 +126,11 @@ $(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h $(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h -$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_preprocessor.h - $(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h; +$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh gayberne lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h; -$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h - $(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h; +$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh gayberne_lj lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h; $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/gayberne_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o $(OCL) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR) @@ -143,11 +138,11 @@ $(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/g $(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp $(OCL) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_preprocessor.h - $(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h; +$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh re_squared lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h; -$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_preprocessor.h - $(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h; +$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu lal_ellipsoid_extra.h lal_aux_fun1.h lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh re_squared_lj lal_preprocessor.h lal_aux_fun1.h lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h; $(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/re_squared_lj_cl.h $(OBJ_DIR)/lal_base_ellipsoid.o $(OCL) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR) diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu index 3446c1d4e0..2a78719ffb 100644 --- a/lib/gpu/lal_atom.cu +++ b/lib/gpu/lal_atom.cu @@ -17,8 +17,10 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x, - __global int *type, const int nall) { +__kernel void kernel_cast_x(__global numtyp4 *restrict x_type, + const __global double *restrict x, + const __global int *restrict type, + const int nall) { int ii=GLOBAL_ID_X; if (ii1) { \ @@ -137,3 +139,72 @@ ans[ii]=f; \ } +#else + +#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ + eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + } + +#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ + e_coul += shfl_xor(e_coul, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + *engv=e_coul; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + } + +#endif + diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index f88c4417af..6597dbfe98 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -132,7 +132,7 @@ int * BaseAtomicT::reset_nbors(const int nall, const int inum, int *ilist, resize_atom(inum,nall,success); resize_local(inum,mn,success); if (!success) - return false; + return NULL; nbor->get_host(inum,ilist,numj,firstneigh,block_size()); diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index f9bb2a52f3..f61950cfee 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -135,7 +135,7 @@ int * BaseChargeT::reset_nbors(const int nall, const int inum, int *ilist, resize_atom(inum,nall,success); resize_local(inum,mn,success); if (!success) - return false; + return NULL; nbor->get_host(inum,ilist,numj,firstneigh,block_size()); diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 7c090e6351..8c793f554e 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -137,7 +137,7 @@ int * BaseDipoleT::reset_nbors(const int nall, const int inum, int *ilist, resize_atom(inum,nall,success); resize_local(inum,mn,success); if (!success) - return false; + return NULL; nbor->get_host(inum,ilist,numj,firstneigh,block_size()); diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 7e86d03e50..2b45b109db 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -80,7 +80,7 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, ucl_device=device->gpu; atom=&device->atom; - _block_size=device->pair_block_size(); + _block_size=device->block_ellipse(); compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere); // Initialize host-device load balancer @@ -118,9 +118,8 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, ans->force.zero(); // Memory for ilist ordered by particle type - if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS) - return 0; - else return -3; + if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS) + return -3; _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index 3410a5effd..f04b79e142 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -24,14 +24,18 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1, - __global numtyp4* coeff2, - __global numtyp2 *cutsq_sigma, - const int lj_types, __global numtyp *sp_lj_in, - __global int *dev_nbor, __global int *dev_packed, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom) { +__kernel void k_born(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1, + const __global numtyp4 *restrict coeff2, + const __global numtyp2 *restrict cutsq_sigma, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -49,7 +53,7 @@ __kernel void k_born(__global numtyp4 *x_, __global numtyp4 *coeff1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1, - __global numtyp4* coeff2, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, - __global numtyp4 *cutsq_sigma, +__kernel void k_born_long(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -61,7 +66,7 @@ __kernel void k_born_long(__global numtyp4 *x_, __global numtyp4 *coeff1, virial[i]=(acctyp)0; if (ii q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1, - __global numtyp4* coeff2, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, - __global numtyp4 *cutsq_sigma, +__kernel void k_born_wolf(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { @@ -64,7 +69,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1, virial[i]=(acctyp)0; if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -83,7 +89,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1, numtyp factor_lj, factor_coul; factor_lj = sp_lj[sbmask(j)]; - factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + factor_coul = sp_lj[sbmask(j)+4]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -98,7 +104,7 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1, int mtype=itype*lj_types+jtype; if (rsq0) { if (rsq < cut_coulsq) - e_coul += prefactor*(v_sh-factor_coul); - if (rsq < coeff1[mtype].w) { + e_coul += v_sh*factor_coul; + if (rsq < cutsq_sigma[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); @@ -154,14 +160,18 @@ __kernel void k_born_wolf(__global numtyp4 *x_, __global numtyp4 *coeff1, } // if ii } -__kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in, - __global numtyp4* coeff2_in, - __global numtyp* sp_lj_in, - __global int *dev_nbor, __global int *dev_packed, - __global acctyp4 *ans, __global acctyp *engv, +__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, - __global numtyp4 *cutsq_sigma, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp4 *restrict cutsq_sigma, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp alf, const numtyp e_shift, const numtyp f_shift, const int t_per_atom) { @@ -190,7 +200,7 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in __syncthreads(); if (ii0) { - acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * + qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; } @@ -210,7 +221,7 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in numtyp factor_lj, factor_coul; factor_lj = sp_lj[sbmask(j)]; - factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4]; + factor_coul = sp_lj[sbmask(j)+4]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -224,11 +235,11 @@ __kernel void k_born_wolf_fast(__global numtyp4 *x_, __global numtyp4 *coeff1_in if (rsq0) { if (rsq < cut_coulsq) - e_coul += prefactor*(v_sh-factor_coul); - if (rsq < coeff1[mtype].w) { + e_coul += v_sh*factor_coul; + if (rsq < cutsq_sigma[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index b0c817ad35..8da8587f58 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -24,13 +24,17 @@ texture pos_tex; #define pos_tex x_ #endif -__kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1, - __global numtyp4* coeff2, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom) { +__kernel void k_buck(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -48,7 +52,7 @@ __kernel void k_buck(__global numtyp4 *x_, __global numtyp4 *coeff1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1, - __global numtyp4* coeff2, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_ , - __global numtyp4 *cutsq, const numtyp qqrd2e, - const int t_per_atom) { +__kernel void k_buck_coul(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_ , + const __global numtyp4 *restrict cutsq, + const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,7 +65,7 @@ __kernel void k_buck_coul(__global numtyp4 *x_, __global numtyp4 *coeff1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1, - __global numtyp4* coeff2, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, - __global numtyp *cutsq, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const int t_per_atom) { +__kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff1, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -61,7 +66,7 @@ __kernel void k_buck_coul_long(__global numtyp4 *x_, __global numtyp4 *coeff1, virial[i]=(acctyp)0; if (ii pos_tex; #define pos_tex x_ #endif -__kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom) { +__kernel void k_cg_cmm(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -48,7 +52,7 @@ __kernel void k_cg_cmm(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_ , - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const int t_per_atom) { +__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,7 +65,7 @@ __kernel void k_cg_cmm_long(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1, - const int lj_types, __global numtyp *sp_lj_in, - __global int *dev_nbor, __global int *dev_packed, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const numtyp denom_lj, - const numtyp cut_bothsq, const numtyp cut_ljsq, - const numtyp cut_lj_innersq, const int t_per_atom) { +__kernel void k_charmm_long(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp denom_lj, + const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_lj_innersq, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -61,7 +66,7 @@ __kernel void k_charmm_long(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii pos_tex; #define pos_tex x_ #endif -__kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, - __global numtyp4* colloid1, - __global numtyp4* colloid2, - __global int *form, - __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, +__kernel void k_colloid(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global numtyp4 *restrict colloid1, + const __global numtyp4 *restrict colloid2, + const __global int *form, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -52,7 +55,7 @@ __kernel void k_colloid(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_ , +__kernel void k_coul_dsf(const __global numtyp4 *restrict x_, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_ , const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { @@ -58,7 +62,7 @@ __kernel void k_coul_dsf(__global numtyp4 *x_, const int lj_types, virial[i]=(acctyp)0; if (ii0) { + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + e_coul += (acctyp)2.0*e_self; + } + for ( ; nbor0) { + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + e_coul += (acctyp)2.0*e_self; + } for ( ; nbor q_tex; #define q_tex q_ #endif -__kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_cl_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, +#if (ARCH < 300) + +#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + __local acctyp red_acc[6][BLOCK_PAIR]; \ + \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=e_coul; \ + \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<4; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + e_coul=red_acc[3][tid]; \ + \ + if (vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + } \ + \ + if (offset==0) { \ + __global acctyp *ap1=engv+ii; \ + if (eflag>0) { \ + *ap1=(acctyp)0; \ + ap1+=inum; \ + *ap1=e_coul; \ + ap1+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *ap1=virial[i]; \ + ap1+=inum; \ + } \ + } \ + ans[ii]=f; \ + } + +#else + +#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + e_coul += shfl_xor(e_coul, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + __global acctyp *ap1=engv+ii; \ + if (eflag>0) { \ + *ap1=(acctyp)0; \ + ap1+=inum; \ + *ap1=e_coul; \ + ap1+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *ap1=virial[i]; \ + ap1+=inum; \ + } \ + } \ + ans[ii]=f; \ + } + +#endif + +__kernel void k_coul_long(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_cl_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -55,7 +154,7 @@ __kernel void k_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii1) { - __local acctyp red_acc[6][BLOCK_PAIR]; - - red_acc[0][tid]=f.x; - red_acc[1][tid]=f.y; - red_acc[2][tid]=f.z; - red_acc[3][tid]=e_coul; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<4; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - f.x=red_acc[0][tid]; - f.y=red_acc[1][tid]; - f.z=red_acc[2][tid]; - e_coul=red_acc[3][tid]; - - if (vflag>0) { - for (int r=0; r<6; r++) - red_acc[r][tid]=virial[r]; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<6; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - for (int r=0; r<6; r++) - virial[r]=red_acc[r][tid]; - } - } - - // Store answers - if (offset==0) { - __global acctyp *ap1=engv+ii; - if (eflag>0) { - *ap1=(acctyp)0; - ap1+=inum; - *ap1=e_coul; - ap1+=inum; - } - if (vflag>0) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=inum; - } - } - ans[ii]=f; - } + store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } // if ii } -__kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, - __global numtyp4* lj3_in, - __global numtyp* sp_cl_in, - __global int *dev_nbor, __global int *dev_packed, - __global acctyp4 *ans, __global acctyp *engv, +__kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1_in, + const __global numtyp4 *restrict lj3_in, + const __global numtyp *restrict sp_cl_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, + const int nbor_pitch, + const __global numtyp *restrict q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; @@ -193,7 +243,7 @@ __kernel void k_coul_long_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __syncthreads(); if (ii1) { - __local acctyp red_acc[6][BLOCK_PAIR]; - - red_acc[0][tid]=f.x; - red_acc[1][tid]=f.y; - red_acc[2][tid]=f.z; - red_acc[3][tid]=e_coul; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<4; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - f.x=red_acc[0][tid]; - f.y=red_acc[1][tid]; - f.z=red_acc[2][tid]; - e_coul=red_acc[3][tid]; - - if (vflag>0) { - for (int r=0; r<6; r++) - red_acc[r][tid]=virial[r]; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<6; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - for (int r=0; r<6; r++) - virial[r]=red_acc[r][tid]; - } - } - - // Store answers - if (offset==0) { - __global acctyp *ap1=engv+ii; - if (eflag>0) { - *ap1=(acctyp)0; - ap1+=inum; - *ap1=e_coul; - ap1+=inum; - } - if (vflag>0) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; - ap1+=inum; - } - } - ans[ii]=f; - } + store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } // if ii } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 3952495393..979ff2cbed 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -580,14 +580,14 @@ int DeviceT::compile_kernels() { k_info.set_function(*dev_program,"kernel_info"); _compiled=true; - UCL_Vector gpu_lib_data(14,*gpu,UCL_NOT_PINNED); + UCL_Vector gpu_lib_data(15,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); _ptx_arch=static_cast(gpu_lib_data[0])/100.0; #ifndef USE_OPENCL - if (_ptx_arch>gpu->arch()) + if (_ptx_arch>gpu->arch() || floor(_ptx_arch)arch())) return -4; #endif @@ -606,6 +606,7 @@ int DeviceT::compile_kernels() { _block_nbor_build=gpu_lib_data[10]; _block_bio_pair=gpu_lib_data[11]; _max_bio_shared_types=gpu_lib_data[12]; + _block_ellipse=gpu_lib_data[14]; if (static_cast(_block_pair)>gpu->group_size()) _block_pair=gpu->group_size(); diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu index 54a95417be..28b58f7760 100644 --- a/lib/gpu/lal_device.cu +++ b/lib/gpu/lal_device.cu @@ -17,7 +17,8 @@ #include "lal_preprocessor.h" #endif -__kernel void kernel_zero(__global int *mem, int numel) { +__kernel void kernel_zero(__global int *restrict mem, + int numel) { int ii=GLOBAL_ID_X; if (ii pos_tex; +texture q_tex; +texture mu_tex; +#else +texture pos_tex; +texture q_tex; +texture mu_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#define mu_tex mu_ +#endif + +#if (ARCH < 300) #define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ t_per_atom, offset, eflag, vflag, ans, engv) \ @@ -73,32 +90,63 @@ ans[ii+inum]=tor; \ } -#ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; -texture mu_tex; #else -texture pos_tex; -texture q_tex; -texture mu_tex; + +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + tor.x += shfl_xor(tor.x, s, t_per_atom); \ + tor.y += shfl_xor(tor.y, s, t_per_atom); \ + tor.z += shfl_xor(tor.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ + e_coul += shfl_xor(e_coul, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + *engv=e_coul; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + ans[ii+inum]=tor; \ + } + #endif -#else -#define pos_tex x_ -#define q_tex q_ -#define mu_tex mu_ -#endif - -__kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_ , - __global numtyp4 *mu_, - __global numtyp *cutsq, const numtyp qqrd2e, - const int t_per_atom) { +__kernel void k_dipole_lj(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp4 *restrict mu_, + const __global numtyp *restrict cutsq, + const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -125,7 +173,7 @@ __kernel void k_dipole_lj(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii pos_tex; +texture q_tex; +texture mu_tex; +#else +texture pos_tex; +texture q_tex; +texture mu_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#define mu_tex mu_ +#endif + +#if (ARCH < 300) + #define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ @@ -73,32 +91,63 @@ ans[ii+inum]=tor; \ } -#ifndef _DOUBLE_DOUBLE -texture pos_tex; -texture q_tex; -texture mu_tex; #else -texture pos_tex; -texture q_tex; -texture mu_tex; + +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + tor.x += shfl_xor(tor.x, s, t_per_atom); \ + tor.y += shfl_xor(tor.y, s, t_per_atom); \ + tor.z += shfl_xor(tor.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ + e_coul += shfl_xor(e_coul, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv=energy; \ + engv+=inum; \ + *engv=e_coul; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + ans[ii+inum]=tor; \ + } + #endif -#else -#define pos_tex x_ -#define q_tex q_ -#define mu_tex mu_ -#endif - -__kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_ , - __global numtyp4 *mu_, - __global numtyp *cutsq, const numtyp qqrd2e, - const int t_per_atom) { +__kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_ , + const __global numtyp4 *restrict mu_, + const __global numtyp *restrict cutsq, + const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -125,7 +174,7 @@ __kernel void k_dipole_lj_sf(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, - double ***host_frho_spline, double rdr, double rdrho, int nrhor, - int nrho, int nz2r, int nfrho, int nr, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, const double gpu_split, FILE *_screen) + double ***host_frho_spline, double rdr, double rdrho, + double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, @@ -97,6 +98,7 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, _cutforcesq=host_cutforcesq; _rdr=rdr; _rdrho = rdrho; + _rhomax=rhomax; _nrhor=nrhor; _nrho=nrho; _nz2r=nz2r; @@ -468,15 +470,15 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv, &eflag, &ainum, &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, - &_nrho, &_nr, &this->_threads_per_atom); + &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, &rhor_spline2, &frho_spline1, &frho_spline2, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &_fp, &this->ans->engv,&eflag, &ainum, &nbor_pitch, - &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_nrho, &_nr, - &this->_threads_per_atom); + &_ntypes, &_cutforcesq, &_rdr, &_rdrho, &_rhomax, &_nrho, + &_nr, &this->_threads_per_atom); } this->time_pair.stop(); diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index ec20bd672f..aedbf7458f 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -52,8 +52,10 @@ texture z2r_sp2_tex; #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? (A) : (B)) +#if (ARCH < 300) + #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ - eflag,vflag,engv,rdrho,nrho,i) \ + eflag,vflag,engv,rdrho,nrho,i,rhomax) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=rho; \ @@ -76,10 +78,11 @@ texture z2r_sp2_tex; if (eflag>0) { \ fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ + if (rho > rhomax) energy += fp*(rho-rhomax); \ engv[ii]=(acctyp)2.0*energy; \ } \ } - + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ @@ -125,18 +128,80 @@ texture z2r_sp2_tex; ans[ii]=f; \ } -__kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r, - __global int *type2frho, - __global numtyp4 *rhor_spline2, - __global numtyp4 *frho_spline1, - __global numtyp4 *frho_spline2, - __global int *dev_nbor, __global int *dev_packed, - __global numtyp *fp_, __global acctyp *engv, - const int eflag, const int inum, - const int nbor_pitch, const int ntypes, - const numtyp cutforcesq, const numtyp rdr, - const numtyp rdrho, const int nrho, const int nr, - const int t_per_atom) { +#else + +#define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ + eflag,vflag,engv,rdrho,nrho,i,rhomax) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) \ + rho += shfl_xor(rho, s, t_per_atom); \ + } \ + if (offset==0) { \ + numtyp p = rho*rdrho + (numtyp)1.0; \ + int m=p; \ + m = MAX(1,MIN(m,nrho-1)); \ + p -= m; \ + p = MIN(p,(numtyp)1.0); \ + int index = type2frho[itype]*(nrho+1)+m; \ + numtyp4 coeff; fetch4(coeff,index,frho_sp1_tex); \ + numtyp fp = (coeff.x*p + coeff.y)*p + coeff.z; \ + fp_[i]=fp; \ + if (eflag>0) { \ + fetch4(coeff,index,frho_sp2_tex); \ + energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ + if (rho > rhomax) energy += fp*(rho-rhomax); \ + engv[ii]=(acctyp)2.0*energy; \ + } \ + } + +#define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv+=energy; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv=virial[i]; \ + engv+=inum; \ + } \ + } \ + ans[ii]=f; \ + } + +#endif + +__kernel void k_energy(const __global numtyp4 *restrict x_, + const __global int2 *restrict type2rhor_z2r, + const __global int *restrict type2frho, + const __global numtyp4 *restrict rhor_spline2, + const __global numtyp4 *restrict frho_spline1, + const __global numtyp4 *restrict frho_spline2, + const __global int *dev_nbor, + const __global int *dev_packed, + __global numtyp *restrict fp_, + __global acctyp *restrict engv, + const int eflag, const int inum, const int nbor_pitch, + const int ntypes, const numtyp cutforcesq, + const numtyp rdr, const numtyp rdrho, + const numtyp rhomax, const int nrho, + const int nr, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -144,7 +209,7 @@ __kernel void k_energy(__global numtyp4 *x_, __global int2 *type2rhor_z2r, acctyp energy = (acctyp)0; if (ii { int init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, - double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, const double gpu_split, - FILE *_screen); + double rdrho, double rhomax, int nrhor, int nrho, int nz2r, + int nfrho, int nr, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen); // Copy charges to device asynchronously inline void add_fp_data() { @@ -112,7 +112,7 @@ class EAM : public BaseAtomic { UCL_D_Vec frho_spline1, frho_spline2; UCL_D_Vec rhor_spline1, rhor_spline2; - numtyp _cutforcesq,_rdr,_rdrho; + numtyp _cutforcesq,_rdr,_rdrho, _rhomax; int _nfrho,_nrhor,_nrho,_nz2r,_nr; diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index 359d949a2d..687a04529f 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -31,7 +31,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, int nrhor, + double rdr, double rdrho, double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -66,9 +66,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho, - nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, - screen); + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, + nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + gpu_split, screen); EAMMF.device->world_barrier(); if (message) @@ -86,9 +86,9 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, - host_frho_spline, rdr, rdrho, nrhor, nrho, nz2r, nfrho, - nr, nlocal, nall, 300, maxspecial, cell_size, - gpu_split, screen); + host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, + nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); if (message) diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index e2287c0af2..8311f4d617 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -19,7 +19,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef NV_KERNEL -#include "lal_preprocessor.h" +#include "lal_aux_fun1.h" #ifndef _DOUBLE_DOUBLE texture pos_tex, quat_tex; #else @@ -30,11 +30,6 @@ texture pos_tex, quat_tex; #define quat_tex qif #endif -#define atom_info(t_per_atom, ii, tid, offset) \ - tid=THREAD_ID_X; \ - offset=tid & (t_per_atom-1); \ - ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; - #define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ i, numj, stride, list_end, nbor) \ nbor=nbor_mem+ii; \ @@ -42,55 +37,11 @@ texture pos_tex, quat_tex; nbor+=nbor_stride; \ numj=*nbor; \ nbor+=nbor_stride; \ - list_end=nbor+fast_mul(nbor_stride,numj); \ - nbor+=fast_mul(offset,nbor_stride); \ + list_end=nbor+fast_mul(nbor_stride,numj); \ + nbor+=fast_mul(offset,nbor_stride); \ stride=fast_mul(t_per_atom,nbor_stride); -#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - engv+=ii; \ - if (eflag>0) { \ - *engv=energy; \ - engv+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *engv=virial[i]; \ - engv+=inum; \ - } \ - } \ - ans[ii]=f; \ - } +#if (ARCH < 300) #define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ t_per_atom, offset, eflag, vflag, ans, engv) \ @@ -195,6 +146,80 @@ texture pos_tex, quat_tex; ans[ii]=old; \ } +#else + +#define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + tor.x += shfl_xor(tor.x, s, t_per_atom); \ + tor.y += shfl_xor(tor.y, s, t_per_atom); \ + tor.z += shfl_xor(tor.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + __global acctyp *ap1=engv+ii; \ + if (eflag>0) { \ + *ap1=energy; \ + ap1+=astride; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *ap1=virial[i]; \ + ap1+=astride; \ + } \ + } \ + ans[ii]=f; \ + ans[ii+astride]=tor; \ + } + +#define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ + eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + engv+=ii; \ + if (eflag>0) { \ + *engv+=energy; \ + engv+=inum; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *engv+=virial[i]; \ + engv+=inum; \ + } \ + } \ + acctyp4 old=ans[ii]; \ + old.x+=f.x; \ + old.y+=f.y; \ + old.z+=f.z; \ + ans[ii]=old; \ + } + +#endif + /* ---------------------------------------------------------------------- dot product of 2 vectors ------------------------------------------------------------------------- */ diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 0be6c0922d..47ee173a4b 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -29,22 +29,24 @@ texture pos_tex; // -- Only unpack neighbors matching the specified inclusive range of forms // -- Only unpack neighbors within cutoff // --------------------------------------------------------------------------- -__kernel void kernel_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form, - const int ntypes, __global int *dev_nbor, +__kernel void kernel_nbor(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict cut_form, + const int ntypes, + __global int *dev_nbor, const int nbor_pitch, const int start, const int inum, - __global int *dev_ij, const int form_low, - const int form_high) { + const __global int *dev_ij, + const int form_low, const int form_high) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X+start; if (ii pos_tex; #define pos_tex x_ #endif -__kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1, +__kernel void k_gauss(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict gauss1, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -48,7 +51,7 @@ __kernel void k_gauss(__global numtyp4 *x_, __global numtyp4 *gauss1, virial[i]=(acctyp)0; if (ii pos_tex; #define pos_tex x_ #endif -__kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom) { +__kernel void k_lj(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -48,7 +52,7 @@ __kernel void k_lj(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii pos_tex; #define pos_tex x_ #endif -__kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom) { +__kernel void k_lj96(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -48,7 +52,7 @@ __kernel void k_lj96(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const int t_per_atom) { +__kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,7 +65,7 @@ __kernel void k_lj_class2_long(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_ , - __global numtyp *cutsq, const numtyp qqrd2e, - const int t_per_atom) { +__kernel void k_lj_coul(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp qqrd2e, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,7 +65,7 @@ __kernel void k_lj_coul(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_ , - __global numtyp *cutsq, const numtyp qqrd2e, - const numtyp kappa, - const int t_per_atom) { +__kernel void k_lj_debye(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_ , + const __global numtyp *restrict cutsq, + const numtyp qqrd2e, const numtyp kappa, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -61,7 +66,7 @@ __kernel void k_lj_debye_pair(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii q_tex; #define q_tex q_ #endif -__kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_, - const numtyp cut_coulsq, const numtyp qqrd2e, - const numtyp g_ewald, const int t_per_atom) { +__kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -60,7 +65,7 @@ __kernel void k_lj_coul_long(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii q_tex; #define MY_PIS (acctyp)1.77245385090551602729 -__kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, __global numtyp *q_ , +__kernel void k_lj_dsf(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_ , const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp e_shift, const numtyp f_shift, const numtyp alpha, const int t_per_atom) { @@ -63,7 +68,7 @@ __kernel void k_lj_dsf(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii0) { + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + e_coul += (acctyp)2.0*e_self; + } + for ( ; nbor0) { + acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * + qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; + e_coul += (acctyp)2.0*e_self; + } + for ( ; nbor pos_tex; #define pos_tex x_ #endif -__kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1, - __global numtyp4* lj3, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, +__kernel void k_lj_expand(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -50,7 +54,7 @@ __kernel void k_lj_expand(__global numtyp4 *x_, __global numtyp4 *lj1, virial[i]=(acctyp)0; if (ii pos_tex; #define pos_tex x_ #endif -__kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1, - __global numtyp2* mor2, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom) { +__kernel void k_morse(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict mor1, + const __global numtyp2 *restrict mor2, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -50,7 +54,7 @@ __kernel void k_morse(__global numtyp4 *x_, __global numtyp4 *mor1, virial[i]=(acctyp)0; if (ii pos_tex; texture pos_tex; #endif -__kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id, +__kernel void calc_cell_id(const numtyp4 *restrict pos, + unsigned *restrict cell_id, + int *restrict particle_id, numtyp boxlo0, numtyp boxlo1, numtyp boxlo2, numtyp i_cell_size, int ncellx, int ncelly, int ncellz, int inum, int nall, @@ -62,8 +64,9 @@ __kernel void calc_cell_id(numtyp4 *pos, unsigned *cell_id, int *particle_id, } } -__kernel void kernel_calc_cell_counts(unsigned *cell_id, - int *cell_counts, int nall, int ncell) { +__kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, + int *restrict cell_counts, + int nall, int ncell) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < nall) { int id = cell_id[idx]; @@ -94,8 +97,9 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id, -__kernel void transpose(__global int *out, __global int *in, int columns_in, - int rows_in) +__kernel void transpose(__global int *restrict out, + const __global int *restrict in, + int columns_in, int rows_in) { __local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; @@ -117,9 +121,9 @@ __kernel void transpose(__global int *out, __global int *in, int columns_in, out[j*rows_in+i] = block[ti][tj]; } -__kernel void calc_neigh_list_cell(__global numtyp4 *x_, - __global int *cell_particle_id, - __global int *cell_counts, +__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, + const __global int *restrict cell_particle_id, + const __global int *restrict cell_counts, __global int *nbor_list, __global int *host_nbor_list, __global int *host_numj, @@ -234,8 +238,10 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_, __kernel void kernel_special(__global int *dev_nbor, __global int *host_nbor_list, - __global int *host_numj, __global int *tag, - __global int *nspecial, __global int *special, + const __global int *host_numj, + const __global int *restrict tag, + const __global int *restrict nspecial, + const __global int *restrict special, int inum, int nt, int max_nbors, int t_per_atom) { int tid=THREAD_ID_X; int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index 646afa5900..0ed5c6bad5 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -41,15 +41,18 @@ texture q_tex; // Number of pencils per block for charge spread #define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE) -__kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_, +__kernel void particle_map(const __global numtyp4 *restrict x_, + const __global numtyp *restrict q_, const grdtyp delvolinv, const int nlocal, - __global int *counts, __global grdtyp4 *ans, + __global int *restrict counts, + __global grdtyp4 *restrict ans, const grdtyp b_lo_x, const grdtyp b_lo_y, const grdtyp b_lo_z, const grdtyp delxinv, const grdtyp delyinv, const grdtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, - const int max_atoms, __global int *error) { + const int max_atoms, + __global int *restrict error) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; @@ -97,8 +100,10 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_, /* --------------------------- */ -__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms, - __global grdtyp *brick, __global grdtyp *_rho_coeff, +__kernel void make_rho(const __global int *restrict counts, + const __global grdtyp4 *restrict atoms, + __global grdtyp *restrict brick, + const __global grdtyp *restrict _rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int npts_z, const int nlocal_x, const int nlocal_y, const int nlocal_z, @@ -192,15 +197,17 @@ __kernel void make_rho(__global int *counts, __global grdtyp4 *atoms, } } -__kernel void interp(__global numtyp4 *x_, __global numtyp *q_, - const int nlocal, __global grdtyp4 *brick, - __global grdtyp *_rho_coeff, const int npts_x, - const int npts_yx, const grdtyp b_lo_x, +__kernel void interp(const __global numtyp4 *restrict x_, + const __global numtyp *restrict q_, + const int nlocal, + const __global grdtyp4 *restrict brick, + const __global grdtyp *restrict _rho_coeff, + const int npts_x, const int npts_yx, const grdtyp b_lo_x, const grdtyp b_lo_y, const grdtyp b_lo_z, const grdtyp delxinv, const grdtyp delyinv, const grdtyp delzinv, const int order, const int order2, const grdtyp qqrd2e_scale, - __global acctyp4 *ans) { + __global acctyp4 *restrict ans) { __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index b817bbe551..f681268f1c 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -56,8 +56,7 @@ // Definition: Default thread block size for "bio" pair styles // MAX_BIO_SHARED_TYPES // Definition: Max # of atom type params can be stored in shared memory -// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 && -// MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR +// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 // //*************************************************************************/ @@ -80,6 +79,7 @@ #define __kernel extern "C" __global__ #define __local __shared__ #define __global +#define restrict __restrict__ #define atom_add atomicAdd #define ucl_inline static __inline__ __device__ @@ -116,8 +116,22 @@ #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 512 #define BLOCK_BIO_PAIR 512 +#define BLOCK_ELLIPSE 256 #define MAX_SHARED_TYPES 11 +#ifdef _SINGLE_SINGLE +#define shfl_xor __shfl_xor +#else +ucl_inline double shfl_xor(double var, int laneMask, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_xor(tmp.x,laneMask,width); + tmp.y = __shfl_xor(tmp.y,laneMask,width); + return __hiloint2double(tmp.x,tmp.y); +} +#endif + #endif #endif @@ -380,3 +394,7 @@ typedef struct _double4 double4; #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; +#ifndef BLOCK_ELLIPSE +#define BLOCK_ELLIPSE BLOCK_PAIR +#endif + diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index c858b09801..28e15b5cdf 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -32,14 +32,20 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) return ans; } -__kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q, - __global numtyp4* shape, __global numtyp4* well, - __global numtyp *splj, __global numtyp2* sig_eps, - const int ntypes, __global int *dev_nbor, - const int stride, __global acctyp4 *ans, - const int astride, __global acctyp *engv, - __global int *err_flag, const int eflag, - const int vflag, const int inum, +__kernel void k_resquared(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict q, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict splj, + const __global numtyp2 *restrict sig_eps, + const int ntypes, + const __global int *dev_nbor, + const int stride, + __global acctyp4 *restrict ans, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, + const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -68,7 +74,7 @@ __kernel void k_resquared(__global numtyp4* x_,__global numtyp4 *q, virial[i]=(acctyp)0; if (ii1) { \ + __local acctyp red_acc[7][BLOCK_PAIR]; \ + red_acc[0][tid]=f.x; \ + red_acc[1][tid]=f.y; \ + red_acc[2][tid]=f.z; \ + red_acc[3][tid]=tor.x; \ + red_acc[4][tid]=tor.y; \ + red_acc[5][tid]=tor.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + f.x=red_acc[0][tid]; \ + f.y=red_acc[1][tid]; \ + f.z=red_acc[2][tid]; \ + tor.x=red_acc[3][tid]; \ + tor.y=red_acc[4][tid]; \ + tor.z=red_acc[5][tid]; \ + if (eflag>0 || vflag>0) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + red_acc[6][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + if (offset < s) { \ + for (int r=0; r<7; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + energy=red_acc[6][tid]; \ + } \ + } \ + if (offset==0) { \ + __global acctyp *ap1=engv+ii; \ + if (eflag>0) { \ + *ap1+=energy; \ + ap1+=astride; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *ap1+=virial[i]; \ + ap1+=astride; \ + } \ + } \ + acctyp4 old=ans[ii]; \ + old.x+=f.x; \ + old.y+=f.y; \ + old.z+=f.z; \ + ans[ii]=old; \ + old=ans[ii+astride]; \ + old.x+=tor.x; \ + old.y+=tor.y; \ + old.z+=tor.z; \ + ans[ii+astride]=old; \ + } + +#else + +#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_xor(f.x, s, t_per_atom); \ + f.y += shfl_xor(f.y, s, t_per_atom); \ + f.z += shfl_xor(f.z, s, t_per_atom); \ + tor.x += shfl_xor(tor.x, s, t_per_atom); \ + tor.y += shfl_xor(tor.y, s, t_per_atom); \ + tor.z += shfl_xor(tor.z, s, t_per_atom); \ + energy += shfl_xor(energy, s, t_per_atom); \ + } \ + if (vflag>0) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0) { \ + __global acctyp *ap1=engv+ii; \ + if (eflag>0) { \ + *ap1+=energy; \ + ap1+=astride; \ + } \ + if (vflag>0) { \ + for (int i=0; i<6; i++) { \ + *ap1+=virial[i]; \ + ap1+=astride; \ + } \ + } \ + acctyp4 old=ans[ii]; \ + old.x+=f.x; \ + old.y+=f.y; \ + old.z+=f.z; \ + ans[ii]=old; \ + old=ans[ii+astride]; \ + old.x+=tor.x; \ + old.y+=tor.y; \ + old.z+=tor.z; \ + ans[ii+astride]=old; \ + } + +#endif + +__kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict q, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict splj, + const __global numtyp2 *restrict sig_eps, + const int ntypes, + const __global int *dev_nbor, + const int stride, + __global acctyp4 *restrict ans, + const int astride, + __global acctyp *restrict engv, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int inum, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -55,7 +172,7 @@ __kernel void k_resquared_ellipsoid_sphere(__global numtyp4* x_, virial[i]=(acctyp)0; if (ii1) { - __local acctyp red_acc[7][BLOCK_PAIR]; - - red_acc[0][tid]=f.x; - red_acc[1][tid]=f.y; - red_acc[2][tid]=f.z; - red_acc[3][tid]=tor.x; - red_acc[4][tid]=tor.y; - red_acc[5][tid]=tor.z; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<6; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - f.x=red_acc[0][tid]; - f.y=red_acc[1][tid]; - f.z=red_acc[2][tid]; - tor.x=red_acc[3][tid]; - tor.y=red_acc[4][tid]; - tor.z=red_acc[5][tid]; - - if (eflag>0 || vflag>0) { - for (int r=0; r<6; r++) - red_acc[r][tid]=virial[r]; - red_acc[6][tid]=energy; - - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<7; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - - for (int r=0; r<6; r++) - virial[r]=red_acc[r][tid]; - energy=red_acc[6][tid]; - } - } - - // Store answers - if (offset==0) { - __global acctyp *ap1=engv+ii; - if (eflag>0) { - *ap1+=energy; - ap1+=astride; - } - if (vflag>0) { - for (int i=0; i<6; i++) { - *ap1+=virial[i]; - ap1+=astride; - } - } - acctyp4 old=ans[ii]; - old.x+=f.x; - old.y+=f.y; - old.z+=f.z; - ans[ii]=old; - - old=ans[ii+astride]; - old.x+=tor.x; - old.y+=tor.y; - old.z+=tor.z; - ans[ii+astride]=old; - } + store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } // if ii } -__kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_, - __global numtyp4 *q, __global numtyp4* shape, - __global numtyp4* well, __global numtyp *splj, - __global numtyp2* sig_eps, const int ntypes, - __global int *dev_nbor, const int stride, - __global acctyp4 *ans, __global acctyp *engv, - __global int *err_flag, const int eflag, const int vflag, - const int start, const int inum, const int t_per_atom) { +__kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict q, + const __global numtyp4 *restrict shape, + const __global numtyp4 *restrict well, + const __global numtyp *restrict splj, + const __global numtyp2 *restrict sig_eps, + const int ntypes, + const __global int *dev_nbor, + const int stride, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global int *restrict err_flag, + const int eflag, const int vflag, + const int start, const int inum, + const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); ii+=start; @@ -366,7 +423,7 @@ __kernel void k_resquared_sphere_ellipsoid(__global numtyp4 *x_, virial[i]=(acctyp)0; if (ii pos_tex; #define pos_tex x_ #endif -__kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff, - const numtyp kappa, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, - const int nbor_pitch, const int t_per_atom) { +__kernel void k_yukawa(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict coeff, + const numtyp kappa, const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); @@ -48,7 +51,7 @@ __kernel void k_yukawa(__global numtyp4 *x_, __global numtyp4 *coeff, virial[i]=(acctyp)0; if (ii device; template YukawaColloidT::YukawaColloid() : BaseAtomic(), -_allocated(false), _max_rad_size(0) { +_max_rad_size(0), _allocated(false) { } template diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index 4eb9bc87b2..55f52e70e7 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -29,12 +29,16 @@ texture rad_tex; #define rad_tex rad_ #endif -__kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_, - __global numtyp4 *coeff, const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, - __global int *dev_packed, __global acctyp4 *ans, - __global acctyp *engv, const int eflag, - const int vflag, const int inum, +__kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, + const __global numtyp *restrict rad_, + const __global numtyp4 *restrict coeff, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const numtyp kappa) { int tid, ii, offset; @@ -54,7 +58,7 @@ __kernel void k_yukawa_colloid(__global numtyp4 *x_, __global numtyp *rad_, virial[i]=(acctyp)0; if (ii