Similar bug fixes for tersoff/mod/gpu and tersoff/zbl/gpu; some cleanup for sw/gpu
This commit is contained in:
@ -308,7 +308,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
|
||||
}
|
||||
|
||||
#define threebody(delr1x, delr1y, delr1z, eflag, energy) \
|
||||
#define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy) \
|
||||
{ \
|
||||
numtyp r1 = ucl_sqrt(rsq1); \
|
||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||
@ -361,7 +361,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
||||
} \
|
||||
}
|
||||
|
||||
#define threebody_half(delr1x, delr1y, delr1z) \
|
||||
#define threebody_half(delr1x, delr1y, delr1z, delr2x, delr2y, delr2z) \
|
||||
{ \
|
||||
numtyp r1 = ucl_sqrt(rsq1); \
|
||||
numtyp rinvsq1 = ucl_recip(rsq1); \
|
||||
@ -511,7 +511,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
||||
sw_costheta_ijk=sw3_ijkparam.z;
|
||||
|
||||
numtyp fjx, fjy, fjz, fkx, fky, fkz;
|
||||
threebody(delr1x,delr1y,delr1z,eflag,energy);
|
||||
threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);
|
||||
|
||||
f.x -= fjx + fkx;
|
||||
f.y -= fjy + fky;
|
||||
@ -665,12 +665,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
||||
sw_costheta_ijk=sw3_ijkparam.z;
|
||||
|
||||
numtyp fjx, fjy, fjz;
|
||||
//if (evatom==0) {
|
||||
threebody_half(delr1x,delr1y,delr1z);
|
||||
//} else {
|
||||
// numtyp fkx, fky, fkz;
|
||||
// threebody(delr1x,delr1y,delr1z,eflag,energy);
|
||||
//}
|
||||
threebody_half(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z);
|
||||
|
||||
f.x += fjx;
|
||||
f.y += fjy;
|
||||
@ -819,7 +814,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
sw_costheta_ijk=sw3_ijkparam.z;
|
||||
|
||||
numtyp fjx, fjy, fjz, fkx, fky, fkz;
|
||||
threebody(delr1x,delr1y,delr1z,eflag,energy);
|
||||
threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z,eflag,energy);
|
||||
|
||||
f.x += fjx;
|
||||
f.y += fjy;
|
||||
|
||||
@ -192,7 +192,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+
|
||||
ts4.row_bytes()+cutsq.row_bytes()+
|
||||
ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+
|
||||
map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
@ -250,10 +250,9 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
this->k_short_nbor.run(&this->atom->x, &elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&this->dev_short_nbor, &_cutshortsq, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
|
||||
@ -165,13 +165,12 @@ _texture( ts5_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const double _cutshortsq,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
@ -185,8 +184,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
@ -200,9 +197,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -210,7 +204,7 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
if (rsq<_cutshortsq) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
@ -461,7 +455,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
// rsq<cutsq[ijparam]
|
||||
if (rsq >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
@ -578,6 +573,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
||||
delr1[1] = jx.y-ix.y;
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
if (rsq1 >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
@ -725,7 +721,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int red_acc[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -759,7 +755,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -806,21 +801,14 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
red_acc[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = red_acc[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
@ -863,7 +851,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
delr2[2] = kx.z-jx.z;
|
||||
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
|
||||
|
||||
if (rsq2 > cutsq[jikparam]) continue;
|
||||
numtyp r2 = ucl_sqrt(rsq2);
|
||||
numtyp r2inv = ucl_rsqrt(rsq2);
|
||||
numtyp4 ts1_param, ts2_param, ts4_param, ts5_param;
|
||||
@ -892,6 +879,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
||||
// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
@ -971,7 +959,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int red_acc[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -1005,7 +993,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -1052,21 +1039,14 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
red_acc[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = red_acc[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
|
||||
@ -275,10 +275,9 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
this->k_short_nbor.set_size(GX,BX);
|
||||
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
|
||||
&elem2param, &_nelements, &_nparams,
|
||||
this->k_short_nbor.run(&this->atom->x, &elem2param, &_nelements, &_nparams,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->dev_short_nbor, &ainum,
|
||||
&this->dev_short_nbor, &_cutshortsq, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom);
|
||||
|
||||
// re-allocate zetaij if necessary
|
||||
|
||||
@ -168,13 +168,12 @@ _texture( ts6_tex,int4);
|
||||
#endif
|
||||
|
||||
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict cutsq,
|
||||
const __global int *restrict map,
|
||||
const __global int *restrict elem2param,
|
||||
const int nelements, const int nparams,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
__global int * dev_short_nbor,
|
||||
const double _cutshortsq,
|
||||
const int inum, const int nbor_pitch,
|
||||
const int t_per_atom) {
|
||||
__local int n_stride;
|
||||
@ -188,8 +187,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype=ix.w;
|
||||
itype=map[itype];
|
||||
|
||||
int ncount = 0;
|
||||
int m = nbor;
|
||||
@ -203,9 +200,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
j &= NEIGHMASK;
|
||||
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delx = ix.x-jx.x;
|
||||
@ -213,7 +207,7 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
if (rsq<cutsq[ijparam]) {
|
||||
if (rsq<_cutshortsq) {
|
||||
dev_short_nbor[nbor_short] = nj;
|
||||
nbor_short += n_stride;
|
||||
ncount++;
|
||||
@ -474,7 +468,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
||||
numtyp delz = ix.z-jx.z;
|
||||
numtyp rsq = delx*delx+dely*dely+delz*delz;
|
||||
|
||||
// rsq<cutsq[ijparam]
|
||||
if (rsq >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp feng[2];
|
||||
numtyp ijparam_lam1 = ts1[ijparam].x;
|
||||
numtyp4 ts2_ijparam = ts2[ijparam];
|
||||
@ -594,6 +589,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
||||
delr1[1] = jx.y-ix.y;
|
||||
delr1[2] = jx.z-ix.z;
|
||||
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
|
||||
if (rsq1 >= cutsq[ijparam]) continue;
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
@ -735,7 +731,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int red_acc[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -769,7 +765,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -816,21 +811,14 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
red_acc[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = red_acc[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
@ -873,7 +861,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
delr2[2] = kx.z-jx.z;
|
||||
numtyp rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
|
||||
|
||||
if (rsq2 > cutsq[jikparam]) continue;
|
||||
numtyp r2 = ucl_sqrt(rsq2);
|
||||
numtyp r2inv = ucl_rsqrt(rsq2);
|
||||
numtyp4 ts1_param, ts2_param, ts4_param;
|
||||
@ -899,6 +886,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
||||
// idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||
int idx = nbor_k;
|
||||
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||
|
||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||
numtyp prefactor_jk = zeta_jk.y;
|
||||
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
|
||||
@ -972,7 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
for (int i=0; i<6; i++)
|
||||
virial[i]=(acctyp)0;
|
||||
|
||||
__local int red_acc[2*BLOCK_PAIR];
|
||||
__local int red_acc[BLOCK_PAIR];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@ -1006,7 +994,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||
int jtype=jx.w;
|
||||
jtype=map[jtype];
|
||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||
|
||||
// Compute r12
|
||||
numtyp delr1[3];
|
||||
@ -1053,21 +1040,14 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
||||
k &= NEIGHMASK;
|
||||
if (k == i) {
|
||||
ijnum = nbor_k;
|
||||
red_acc[2*m+0] = ijnum;
|
||||
red_acc[2*m+1] = offset_k;
|
||||
red_acc[m] = ijnum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
numtyp r1 = ucl_sqrt(rsq1);
|
||||
numtyp r1inv = ucl_rsqrt(rsq1);
|
||||
int offset_kf;
|
||||
if (ijnum >= 0) {
|
||||
offset_kf = offset_k;
|
||||
} else {
|
||||
ijnum = red_acc[2*m+0];
|
||||
offset_kf = red_acc[2*m+1];
|
||||
}
|
||||
if (ijnum < 0) ijnum = red_acc[m];
|
||||
|
||||
// idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||
int idx = ijnum;
|
||||
|
||||
Reference in New Issue
Block a user