Enabled again neigh no with tpa > 1 for 3-body gpu styles for backward compatibility, could be slower than neigh no tpa 1 in many cases

This commit is contained in:
Trung Nguyen
2017-07-22 22:57:37 -05:00
parent 3d1d0c58c7
commit a71f5a0c20
6 changed files with 349 additions and 217 deletions

View File

@ -271,9 +271,8 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -284,14 +283,17 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
nbor_end = nbor_j+fast_mul(numj,n_stride);
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_short_nbor[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -312,11 +314,14 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
int k_end = nbor_k+fast_mul(numk,n_stride);
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=dev_short_nbor[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -356,7 +361,8 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j - n_stride;
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
@ -427,8 +433,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -437,13 +443,16 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_short_nbor[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -540,7 +549,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -551,14 +560,17 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
nbor_end = nbor_j+fast_mul(numj,n_stride);
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_short_nbor[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -582,7 +594,8 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j - n_stride;
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
@ -606,11 +619,14 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
}
int nbor_k = nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
int k_end = nbor_k+fast_mul(numk,n_stride);
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_short_nbor[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -727,7 +743,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -740,13 +756,16 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_short_nbor[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -786,16 +805,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
}
// recalculate numk and k_end for the use of short neighbor list
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji: find i in the j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_short_nbor[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -818,7 +839,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum - n_stride;
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
@ -843,7 +865,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_short_nbor[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -888,7 +910,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k - n_stride;
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
@ -976,7 +999,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -989,13 +1012,16 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_short_nbor[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -1035,16 +1061,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
}
// recalculate numk and k_end for the use of short neighbor list
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up for zeta_ji
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_short_nbor[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -1067,7 +1095,8 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum - n_stride;
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
@ -1092,7 +1121,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_short_nbor[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1144,7 +1173,8 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k - n_stride;
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);