From 67a4004f23533fce1af932a7547c51c3cdd296fe Mon Sep 17 00:00:00 2001 From: sjplimp Date: Thu, 7 Apr 2016 21:05:19 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14807 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Nvidia.makefile | 28 +++++++++++++++++ lib/gpu/Opencl.makefile | 23 +++++++++++++- lib/gpu/lal_sw.cpp | 6 ++-- lib/gpu/lal_sw.cu | 2 +- lib/gpu/lal_tersoff.cpp | 2 +- lib/gpu/lal_tersoff.cu | 60 ++++++++++++++++++------------------- lib/gpu/lal_tersoff.h | 2 +- lib/gpu/lal_tersoff_extra.h | 8 ++--- 8 files changed, 91 insertions(+), 40 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 004b387649..18efbda55a 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -70,6 +70,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \ $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \ $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \ $(OBJ_DIR)/lal_tersoff.o $(OBJ_DIR)/lal_tersoff_ext.o \ + $(OBJ_DIR)/lal_tersoff_zbl.o $(OBJ_DIR)/lal_tersoff_zbl_ext.o \ + $(OBJ_DIR)/lal_tersoff_mod.o $(OBJ_DIR)/lal_tersoff_mod_ext.o \ $(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \ $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o \ $(OBJ_DIR)/lal_zbl.o $(OBJ_DIR)/lal_zbl_ext.o \ @@ -122,6 +124,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \ $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \ $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h \ $(OBJ_DIR)/tersoff.cubin $(OBJ_DIR)/tersoff_cubin.h \ + $(OBJ_DIR)/tersoff_zbl.cubin $(OBJ_DIR)/tersoff_zbl_cubin.h \ + $(OBJ_DIR)/tersoff_mod.cubin $(OBJ_DIR)/tersoff_mod_cubin.h \ $(OBJ_DIR)/coul.cubin $(OBJ_DIR)/coul_cubin.h \ $(OBJ_DIR)/coul_debye.cubin $(OBJ_DIR)/coul_debye_cubin.h \ $(OBJ_DIR)/zbl.cubin $(OBJ_DIR)/zbl_cubin.h \ @@ -705,6 +709,30 @@ $(OBJ_DIR)/lal_tersoff.o: $(ALL_H) lal_tersoff.h lal_tersoff.cpp $(OBJ_DIR)/ters $(OBJ_DIR)/lal_tersoff_ext.o: $(ALL_H) lal_tersoff.h lal_tersoff_ext.cpp lal_base_three.h $(CUDR) -o $@ -c lal_tersoff_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/tersoff_zbl.cubin: lal_tersoff_zbl.cu lal_precision.h lal_tersoff_zbl_extra.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_tersoff_zbl.cu + +$(OBJ_DIR)/tersoff_zbl_cubin.h: $(OBJ_DIR)/tersoff_zbl.cubin $(OBJ_DIR)/tersoff_zbl.cubin + $(BIN2C) -c -n tersoff_zbl $(OBJ_DIR)/tersoff_zbl.cubin > $(OBJ_DIR)/tersoff_zbl_cubin.h + +$(OBJ_DIR)/lal_tersoff_zbl.o: $(ALL_H) lal_tersoff_zbl.h lal_tersoff_zbl.cpp $(OBJ_DIR)/tersoff_zbl_cubin.h $(OBJ_DIR)/lal_base_three.o + $(CUDR) -o $@ -c lal_tersoff_zbl.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_tersoff_zbl_ext.o: $(ALL_H) lal_tersoff_zbl.h lal_tersoff_zbl_ext.cpp lal_base_three.h + $(CUDR) -o $@ -c lal_tersoff_zbl_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/tersoff_mod.cubin: lal_tersoff_mod.cu lal_precision.h lal_tersoff_mod_extra.h lal_preprocessor.h + $(CUDA) --cubin -DNV_KERNEL -o $@ lal_tersoff_mod.cu + +$(OBJ_DIR)/tersoff_mod_cubin.h: $(OBJ_DIR)/tersoff_mod.cubin $(OBJ_DIR)/tersoff_mod.cubin + $(BIN2C) -c -n tersoff_mod $(OBJ_DIR)/tersoff_mod.cubin > $(OBJ_DIR)/tersoff_mod_cubin.h + +$(OBJ_DIR)/lal_tersoff_mod.o: $(ALL_H) lal_tersoff_mod.h lal_tersoff_mod.cpp $(OBJ_DIR)/tersoff_mod_cubin.h $(OBJ_DIR)/lal_base_three.o + $(CUDR) -o $@ -c lal_tersoff_mod.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_tersoff_mod_ext.o: $(ALL_H) lal_tersoff_mod.h lal_tersoff_mod_ext.cpp lal_base_three.h + $(CUDR) -o $@ -c lal_tersoff_mod_ext.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/coul.cubin: lal_coul.cu lal_precision.h lal_preprocessor.h $(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul.cu diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index d7bae0f494..b33a392242 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -59,6 +59,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \ $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \ $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \ $(OBJ_DIR)/lal_tersoff.o $(OBJ_DIR)/lal_tersoff_ext.o \ + $(OBJ_DIR)/lal_tersoff_zbl.o $(OBJ_DIR)/lal_tersoff_zbl_ext.o \ + $(OBJ_DIR)/lal_tersoff_mod.o $(OBJ_DIR)/lal_tersoff_mod_ext.o \ $(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \ $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o \ $(OBJ_DIR)/lal_zbl.o $(OBJ_DIR)/lal_zbl_ext.o \ @@ -87,7 +89,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \ $(OBJ_DIR)/soft_cl.h $(OBJ_DIR)/lj_coul_msm_cl.h \ $(OBJ_DIR)/lj_gromacs_cl.h $(OBJ_DIR)/dpd_cl.h \ $(OBJ_DIR)/lj_gauss_cl.h $(OBJ_DIR)/dzugutov_cl.h \ - $(OBJ_DIR)/tersoff_cl.h $(OBJ_DIR)/coul_cl.h \ + $(OBJ_DIR)/tersoff_cl.h $(OBJ_DIR)/tersoff_zbl_cl.h \ + $(OBJ_DIR)/tersoff_mod_cl.h $(OBJ_DIR)/coul_cl.h \ $(OBJ_DIR)/coul_debye_cl.h $(OBJ_DIR)/zbl_cl.h \ $(OBJ_DIR)/lj_cubic_cl.h @@ -510,6 +513,24 @@ $(OBJ_DIR)/lal_tersoff.o: $(ALL_H) lal_tersoff.h lal_tersoff.cpp $(OBJ_DIR)/ter $(OBJ_DIR)/lal_tersoff_ext.o: $(ALL_H) lal_tersoff.h lal_tersoff_ext.cpp lal_base_three.h $(OCL) -o $@ -c lal_tersoff_ext.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu lal_tersoff_zbl_extra.h $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; + +$(OBJ_DIR)/lal_tersoff_zbl.o: $(ALL_H) lal_tersoff_zbl.h lal_tersoff_zbl.cpp $(OBJ_DIR)/tersoff_zbl_cl.h $(OBJ_DIR)/tersoff_zbl_cl.h $(OBJ_DIR)/lal_base_three.o + $(OCL) -o $@ -c lal_tersoff_zbl.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_tersoff_zbl_ext.o: $(ALL_H) lal_tersoff_zbl.h lal_tersoff_zbl_ext.cpp lal_base_three.h + $(OCL) -o $@ -c lal_tersoff_zbl_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu lal_tersoff_mod_extra.h $(PRE1_H) + $(BSH) ./geryon/file_to_cstr.sh tersoff_mod $(PRE1_H) lal_tersoff_mod_extra.h lal_tersoff_mod.cu $(OBJ_DIR)/tersoff_mod_cl.h; + +$(OBJ_DIR)/lal_tersoff_mod.o: $(ALL_H) lal_tersoff_mod.h lal_tersoff_mod.cpp $(OBJ_DIR)/tersoff_mod_cl.h $(OBJ_DIR)/tersoff_mod_cl.h $(OBJ_DIR)/lal_base_three.o + $(OCL) -o $@ -c lal_tersoff_mod.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_tersoff_mod_ext.o: $(ALL_H) lal_tersoff_mod.h lal_tersoff_mod_ext.cpp lal_base_three.h + $(OCL) -o $@ -c lal_tersoff_mod_ext.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/coul_cl.h: lal_coul.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh coul $(PRE1_H) lal_coul.cu $(OBJ_DIR)/coul_cl.h; diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index f14b0a3438..1f68616b0e 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -142,7 +142,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_ ucl_copy(elem2param,dview_elem2param,false); UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); - for (int i = 0; i < lj_types; i++) + for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); @@ -196,13 +196,15 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); + // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa == 1 + // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 4492e5f60a..1e358fb6f7 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -35,7 +35,7 @@ texture sw3_tex; #define sw3_tex sw3 #endif -#define THIRD (numtyp)0.66666667 +#define THIRD (numtyp)0.66666666666666666667 //#define THREE_CONCURRENT diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index fc7ebc4f08..bc89c53765 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -178,7 +178,7 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int ucl_copy(elem2param,dview_elem2param,false); UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); - for (int i = 0; i < lj_types; i++) + for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index 50202c0ee7..e98a454f58 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -43,7 +43,7 @@ texture ts5_tex; //#define THREE_CONCURRENT -#define THIRD (numtyp)0.66666667 +#define TWOTHIRD (numtyp)0.66666666666666666667 #define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ i, nbor_j, offset_j, idx) \ @@ -108,6 +108,7 @@ texture ts5_tex; #define store_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ + __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ if (offset < s) { \ @@ -180,7 +181,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - __global numtyp4 * zetaij, + __global acctyp4 * zetaij, const __global int * dev_nbor, const __global int * dev_packed, const int eflag, const int nall, const int inum, @@ -205,9 +206,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, ts5[tid]=ts5_in[tid]; } - numtyp z = (numtyp)0; - __local numtyp red_acc[BLOCK_PAIR]; - if (tid cutsq[ijparam]) continue; // compute zeta_ij - z = (numtyp)0; + z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { @@ -310,7 +309,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, force_zeta(ijparam_bigb, ijparam_bigr, ijparam_bigd, ijparam_lam2, ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, ijparam_c3, ijparam_c4, rsq1, z, eflag, fpfeng); - numtyp4 zij; + acctyp4 zij; zij.x = fpfeng[0]; zij.y = fpfeng[1]; zij.z = fpfeng[2]; @@ -426,7 +425,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global numtyp4 *restrict zetaij, + const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, __global acctyp4 *restrict ans, @@ -501,7 +500,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, int idx; zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, i, nbor_j, offset_j, idx); - numtyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; @@ -568,7 +567,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, if (vflag>0) { acctyp v[6]; numtyp pre = (numtyp)2.0; - if (evatom==1) pre = THIRD; + if (evatom==1) pre = TWOTHIRD; v[0] = pre*(delr1[0]*fj[0] + delr2[0]*fk[0]); v[1] = pre*(delr1[1]*fj[1] + delr2[1]*fk[1]); v[2] = pre*(delr1[2]*fj[2] + delr2[2]*fk[2]); @@ -595,7 +594,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global numtyp4 *restrict zetaij, + const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, __global acctyp4 *restrict ans, @@ -710,7 +709,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, int idx; zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, j, ijnum, offset_kf, idx); - numtyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); + acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); numtyp force = zeta_ji.x*tpainv; numtyp prefactor_ji = zeta_ji.y; f.x += delr1[0]*force; @@ -776,7 +775,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, int idx; zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, j, nbor_k, offset_k, idx); - numtyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; ts1_param = ts1[jkiparam]; //fetch4(ts1_jkiparam,jkiparam,ts1_tex); @@ -816,7 +815,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global numtyp4 *restrict zetaij, + const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, __global acctyp4 *restrict ans, @@ -931,9 +930,9 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, int idx; zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, j, ijnum, offset_kf, idx); - numtyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); + acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); numtyp force = zeta_ji.x*tpainv; - numtyp prefactor = zeta_ji.y; + numtyp prefactor_ji = zeta_ji.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; @@ -987,24 +986,24 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, h = ts4_param.z; gamma = ts4_param.w; attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, fk); + prefactor_ji, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, fk); f.x += fj[0]; f.y += fj[1]; f.z += fj[2]; - virial[0] += THIRD*(mdelr1[0]*fj[0] + delr2[0]*fk[0]); - virial[1] += THIRD*(mdelr1[1]*fj[1] + delr2[1]*fk[1]); - virial[2] += THIRD*(mdelr1[2]*fj[2] + delr2[2]*fk[2]); - virial[3] += THIRD*(mdelr1[0]*fj[1] + delr2[0]*fk[1]); - virial[4] += THIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); - virial[5] += THIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); + virial[0] += TWOTHIRD*(mdelr1[0]*fj[0] + delr2[0]*fk[0]); + virial[1] += TWOTHIRD*(mdelr1[1]*fj[1] + delr2[1]*fk[1]); + virial[2] += TWOTHIRD*(mdelr1[2]*fj[2] + delr2[2]*fk[2]); + virial[3] += TWOTHIRD*(mdelr1[0]*fj[1] + delr2[0]*fk[1]); + virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); + virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; //int idx = kk*n_stride + j*t_per_atom + offset_k; int idx; zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, j, nbor_k, offset_k, idx); - numtyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -1025,12 +1024,13 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, f.y += fk[1]; f.z += fk[2]; - virial[0] += THIRD*(delr2[0]*fj[0] + mdelr1[0]*fk[0]); - virial[1] += THIRD*(delr2[1]*fj[1] + mdelr1[1]*fk[1]); - virial[2] += THIRD*(delr2[2]*fj[2] + mdelr1[2]*fk[2]); - virial[3] += THIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]); - virial[4] += THIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]); - virial[5] += THIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); + virial[0] += TWOTHIRD*(delr2[0]*fj[0] + mdelr1[0]*fk[0]); + virial[1] += TWOTHIRD*(delr2[1]*fj[1] + mdelr1[1]*fk[1]); + virial[2] += TWOTHIRD*(delr2[2]*fj[2] + mdelr1[2]*fk[2]); + virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]); + virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]); + virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); + } } // for nbor diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h index beae6f5e08..c72ebd7286 100644 --- a/lib/gpu/lal_tersoff.h +++ b/lib/gpu/lal_tersoff.h @@ -100,7 +100,7 @@ class Tersoff : public BaseThree { /// Per-atom arrays: /// zetaij.x = force, zetaij.y = prefactor, zetaij.z = evdwl, /// zetaij.w = zetaij - UCL_D_Vec _zetaij; + UCL_D_Vec _zetaij; UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; diff --git a/lib/gpu/lal_tersoff_extra.h b/lib/gpu/lal_tersoff_extra.h index 672a767783..21a0315f71 100644 --- a/lib/gpu/lal_tersoff_extra.h +++ b/lib/gpu/lal_tersoff_extra.h @@ -227,7 +227,7 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, if ((int)param_powermint == 3) tmp = t*t*t; else tmp = t; - if (tmp > (numtyp)69.0776) ex_delr = (acctyp)1.e30; + if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); @@ -295,7 +295,7 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, if ((int)param_powermint == 3) tmp = t*t*t; else tmp = t; - if (tmp > (numtyp)69.0776) ex_delr = (acctyp)1.e30; + if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); @@ -344,7 +344,7 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, if ((int)param_powermint == 3) tmp = t*t*t; else tmp = t; - if (tmp > (numtyp)69.0776) ex_delr = (acctyp)1.e30; + if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); @@ -391,7 +391,7 @@ ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, if ((int)param_powermint == 3) tmp = t*t*t; else tmp = t; - if (tmp > (numtyp)69.0776) ex_delr = (acctyp)1.e30; + if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp);