diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 6076c55283..2b38bd02dc 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -308,7 +308,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, } -#define threebody(delr1x, delr1y, delr1z, eflag, energy) \ +#define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy) \ { \ numtyp r1 = ucl_sqrt(rsq1); \ numtyp rinvsq1 = ucl_recip(rsq1); \ @@ -361,7 +361,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, } \ } -#define threebody_half(delr1x, delr1y, delr1z) \ +#define threebody_half(delr1x, delr1y, delr1z, delr2x, delr2y, delr2z) \ { \ numtyp r1 = ucl_sqrt(rsq1); \ numtyp rinvsq1 = ucl_recip(rsq1); \ @@ -511,7 +511,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, sw_costheta_ijk=sw3_ijkparam.z; numtyp fjx, fjy, fjz, fkx, fky, fkz; - threebody(delr1x,delr1y,delr1z,eflag,energy); + threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy); f.x -= fjx + fkx; f.y -= fjy + fky; @@ -665,12 +665,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, sw_costheta_ijk=sw3_ijkparam.z; numtyp fjx, fjy, fjz; - //if (evatom==0) { - threebody_half(delr1x,delr1y,delr1z); - //} else { - // numtyp fkx, fky, fkz; - // threebody(delr1x,delr1y,delr1z,eflag,energy); - //} + threebody_half(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z); f.x += fjx; f.y += fjy; @@ -819,7 +814,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, sw_costheta_ijk=sw3_ijkparam.z; numtyp fjx, fjy, fjz, fkx, fky, fkz; - threebody(delr1x,delr1y,delr1z,eflag,energy); + threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy); f.x += fjx; f.y += fjy; diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 182859bdd4..fa6ae67dfc 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -192,7 +192,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in _allocated=true; this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+ - ts4.row_bytes()+cutsq.row_bytes()+ + ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+ map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); return 0; } @@ -250,10 +250,9 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { (BX/this->_threads_per_atom))); this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &cutsq, &map, - &elem2param, &_nelements, &_nparams, + this->k_short_nbor.run(&this->atom->x, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &ainum, + &this->dev_short_nbor, &_cutshortsq, &ainum, &nbor_pitch, &this->_threads_per_atom); // re-allocate zetaij if necessary diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index c26c17969f..ff8625b99d 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -165,13 +165,12 @@ _texture( ts5_tex,int4); #endif __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, - const __global numtyp *restrict cutsq, - const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, __global int * dev_short_nbor, + const double _cutshortsq, const int inum, const int nbor_pitch, const int t_per_atom) { __local int n_stride; @@ -185,8 +184,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - int itype=ix.w; - itype=map[itype]; int ncount = 0; int m = nbor; @@ -200,9 +197,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - int jtype=jx.w; - jtype=map[jtype]; - int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delx = ix.x-jx.x; @@ -210,7 +204,7 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; - if (rsq= cutsq[ijparam]) continue; + numtyp feng[2]; numtyp ijparam_lam1 = ts1[ijparam].x; numtyp4 ts2_ijparam = ts2[ijparam]; @@ -578,6 +573,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, delr1[1] = jx.y-ix.y; delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; + if (rsq1 >= cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -725,7 +721,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, for (int i=0; i<6; i++) virial[i]=(acctyp)0; - __local int red_acc[2*BLOCK_PAIR]; + __local int red_acc[BLOCK_PAIR]; __syncthreads(); @@ -759,7 +755,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; jtype=map[jtype]; - int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delr1[3]; @@ -806,21 +801,14 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; - red_acc[2*m+0] = ijnum; - red_acc[2*m+1] = offset_k; + red_acc[m] = ijnum; break; } } numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); - int offset_kf; - if (ijnum >= 0) { - offset_kf = offset_k; - } else { - ijnum = red_acc[2*m+0]; - offset_kf = red_acc[2*m+1]; - } + if (ijnum < 0) ijnum = red_acc[m]; // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor int idx = ijnum; @@ -863,7 +851,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, delr2[2] = kx.z-jx.z; numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2]; - if (rsq2 > cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); numtyp4 ts1_param, ts2_param, ts4_param, ts5_param; @@ -892,6 +879,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor int idx = nbor_k; if (dev_packed==dev_nbor) idx -= n_stride; + acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -971,7 +959,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, for (int i=0; i<6; i++) virial[i]=(acctyp)0; - __local int red_acc[2*BLOCK_PAIR]; + __local int red_acc[BLOCK_PAIR]; __syncthreads(); @@ -1005,7 +993,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; jtype=map[jtype]; - int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delr1[3]; @@ -1052,21 +1039,14 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; - red_acc[2*m+0] = ijnum; - red_acc[2*m+1] = offset_k; + red_acc[m] = ijnum; break; } } numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); - int offset_kf; - if (ijnum >= 0) { - offset_kf = offset_k; - } else { - ijnum = red_acc[2*m+0]; - offset_kf = red_acc[2*m+1]; - } + if (ijnum < 0) ijnum = red_acc[m]; // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor int idx = ijnum; diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 92db59679e..052167cabd 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -275,10 +275,9 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { (BX/this->_threads_per_atom))); this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &cutsq, &map, - &elem2param, &_nelements, &_nparams, + this->k_short_nbor.run(&this->atom->x, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &ainum, + &this->dev_short_nbor, &_cutshortsq, &ainum, &nbor_pitch, &this->_threads_per_atom); // re-allocate zetaij if necessary diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index b97f9247d0..1a75cb09e7 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -168,13 +168,12 @@ _texture( ts6_tex,int4); #endif __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, - const __global numtyp *restrict cutsq, - const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, __global int * dev_short_nbor, + const double _cutshortsq, const int inum, const int nbor_pitch, const int t_per_atom) { __local int n_stride; @@ -188,8 +187,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - int itype=ix.w; - itype=map[itype]; int ncount = 0; int m = nbor; @@ -203,9 +200,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - int jtype=jx.w; - jtype=map[jtype]; - int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delx = ix.x-jx.x; @@ -213,7 +207,7 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; - if (rsq= cutsq[ijparam]) continue; + numtyp feng[2]; numtyp ijparam_lam1 = ts1[ijparam].x; numtyp4 ts2_ijparam = ts2[ijparam]; @@ -594,6 +589,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, delr1[1] = jx.y-ix.y; delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; + if (rsq1 >= cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -735,7 +731,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, for (int i=0; i<6; i++) virial[i]=(acctyp)0; - __local int red_acc[2*BLOCK_PAIR]; + __local int red_acc[BLOCK_PAIR]; __syncthreads(); @@ -769,7 +765,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; jtype=map[jtype]; - int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delr1[3]; @@ -816,21 +811,14 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; - red_acc[2*m+0] = ijnum; - red_acc[2*m+1] = offset_k; + red_acc[m] = ijnum; break; } } numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); - int offset_kf; - if (ijnum >= 0) { - offset_kf = offset_k; - } else { - ijnum = red_acc[2*m+0]; - offset_kf = red_acc[2*m+1]; - } + if (ijnum < 0) ijnum = red_acc[m]; // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor int idx = ijnum; @@ -873,7 +861,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, delr2[2] = kx.z-jx.z; numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2]; - if (rsq2 > cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); numtyp4 ts1_param, ts2_param, ts4_param; @@ -899,6 +886,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor int idx = nbor_k; if (dev_packed==dev_nbor) idx -= n_stride; + acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -972,7 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, for (int i=0; i<6; i++) virial[i]=(acctyp)0; - __local int red_acc[2*BLOCK_PAIR]; + __local int red_acc[BLOCK_PAIR]; __syncthreads(); @@ -1006,7 +994,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; int jtype=jx.w; jtype=map[jtype]; - int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delr1[3]; @@ -1053,21 +1040,14 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; - red_acc[2*m+0] = ijnum; - red_acc[2*m+1] = offset_k; + red_acc[m] = ijnum; break; } } numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); - int offset_kf; - if (ijnum >= 0) { - offset_kf = offset_k; - } else { - ijnum = red_acc[2*m+0]; - offset_kf = red_acc[2*m+1]; - } + if (ijnum < 0) ijnum = red_acc[m]; // idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor int idx = ijnum;