diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index cb4a3fdbd6..a63d286d9c 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -250,7 +250,8 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { (BX/this->_threads_per_atom))); this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &_cutshortsq, + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -283,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index 9faa59c34d..0026fb97cb 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -165,7 +165,10 @@ texture ts5_tex; #endif __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, - const numtyp cutshortsq, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, __global int * dev_short_nbor, @@ -182,6 +185,8 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; + itype=map[itype]; int ncount = 0; int m = nbor; @@ -195,6 +200,9 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype=jx.w; + jtype=map[jtype]; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delx = ix.x-jx.x; @@ -202,7 +210,7 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; - if (rsq cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; // compute zeta_ij z = (acctyp)0; @@ -391,6 +399,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -426,9 +435,14 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + for ( ; nbor0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -556,7 +569,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -669,7 +682,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, - const __global int * dev_short_nbor, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -738,7 +751,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; @@ -978,7 +991,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 02000d77d3..c37c07f1a1 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -250,7 +250,8 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { (BX/this->_threads_per_atom))); this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &_cutshortsq, + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -283,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 75bacc2179..555485a1b2 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -165,7 +165,10 @@ texture ts5_tex; #endif __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, - const numtyp cutshortsq, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, __global int * dev_short_nbor, @@ -182,6 +185,8 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; + itype=map[itype]; int ncount = 0; int m = nbor; @@ -195,6 +200,9 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype=jx.w; + jtype=map[jtype]; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delx = ix.x-jx.x; @@ -202,7 +210,7 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; - if (rsq cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; // compute zeta_ij z = (acctyp)0; @@ -392,6 +400,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -427,9 +436,14 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + for ( ; nbor0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -560,7 +573,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -748,7 +761,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; @@ -997,7 +1010,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 33edabd799..827613067c 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -275,7 +275,8 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { (BX/this->_threads_per_atom))); this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &_cutshortsq, + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -309,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { &_global_e, &_global_a_0, &_global_epsilon_0, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index 439d4028df..89ae72df8a 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -168,7 +168,10 @@ texture ts6_tex; #endif __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, - const numtyp cutshortsq, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, __global int * dev_short_nbor, @@ -185,6 +188,8 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; + itype=map[itype]; int ncount = 0; int m = nbor; @@ -198,6 +203,9 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype=jx.w; + jtype=map[jtype]; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; // Compute r12 numtyp delx = ix.x-jx.x; @@ -205,7 +213,7 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; - if (rsq cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; // compute zeta_ij z = (acctyp)0; @@ -403,6 +411,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -440,9 +449,14 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + for ( ; nbor0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -576,7 +589,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -758,7 +771,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; @@ -998,7 +1011,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; numtyp mdelr1[3]; mdelr1[0] = -delr1[0];