Re-enabled "neigh no" with tpa > 1 for the 3-body GPU styles for backward compatibility; note that this could be slower than "neigh no" with tpa 1 in many cases
This commit is contained in:
@ -73,7 +73,6 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||||||
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
|
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
|
||||||
nbor->packing(true);
|
nbor->packing(true);
|
||||||
_nbor_data=&(nbor->dev_packed);
|
_nbor_data=&(nbor->dev_packed);
|
||||||
_threads_per_atom = 1; // enforce tpa = 1 for now
|
|
||||||
} else // neigh yes or tpa == 1
|
} else // neigh yes or tpa == 1
|
||||||
_nbor_data=&(nbor->dev_nbor);
|
_nbor_data=&(nbor->dev_nbor);
|
||||||
if (_threads_per_atom*_threads_per_atom>device->warp_size())
|
if (_threads_per_atom*_threads_per_atom>device->warp_size())
|
||||||
|
|||||||
@ -167,7 +167,6 @@ __kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
|
|||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
int jtype=jx.w;
|
int jtype=jx.w;
|
||||||
jtype=map[jtype];
|
jtype=map[jtype];
|
||||||
|
|
||||||
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
|
||||||
|
|
||||||
// Compute r12
|
// Compute r12
|
||||||
@ -217,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int nbor, nbor_end;
|
int nbor, nbor_end, i, numj;
|
||||||
int i, numj;
|
const int* nbor_mem = dev_packed;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||||
n_stride,nbor_end,nbor);
|
n_stride,nbor_end,nbor);
|
||||||
|
|
||||||
@ -227,13 +226,16 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor += n_stride;
|
numj = dev_short_nbor[nbor];
|
||||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
nbor += n_stride;
|
||||||
|
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor];
|
int j=nbor_mem[nbor];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -426,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end;
|
int i, numj, nbor_j, nbor_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -437,14 +439,17 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
int nborj_start = nbor_j;
|
int nborj_start = nbor_j;
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -466,15 +471,22 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
|
|||||||
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
|
||||||
sw_cut_ij=sw3_ijparam.x;
|
sw_cut_ij=sw3_ijparam.x;
|
||||||
|
|
||||||
int nbor_k=nborj_start-offset_j+offset_k;
|
int nbor_k,k_end;
|
||||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
if (dev_packed==dev_nbor) {
|
||||||
int k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k=nborj_start-offset_j+offset_k;
|
||||||
|
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
} else {
|
||||||
|
nbor_k = nbor_j-offset_j+offset_k;
|
||||||
|
if (nbor_k<=nbor_j) nbor_k += n_stride;
|
||||||
|
k_end = nbor_end;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k <= j) continue;
|
if (dev_packed==dev_nbor && k <= j) continue;
|
||||||
|
|
||||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||||
int ktype=kx.w;
|
int ktype=kx.w;
|
||||||
@ -558,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -569,12 +581,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -614,12 +629,14 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
@ -707,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -718,12 +735,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -763,12 +783,14 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
|
|||||||
@ -271,9 +271,8 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int nbor_j, nbor_end;
|
int nbor_j, nbor_end, i, numj;
|
||||||
int i, numj;
|
const int* nbor_mem=dev_packed;
|
||||||
|
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -284,14 +283,17 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
int nborj_start = nbor_j;
|
int nborj_start = nbor_j;
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -312,11 +314,14 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
|||||||
z = (acctyp)0;
|
z = (acctyp)0;
|
||||||
|
|
||||||
int nbor_k = nborj_start-offset_j+offset_k;
|
int nbor_k = nborj_start-offset_j+offset_k;
|
||||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
int k_end = nbor_end;
|
||||||
int k_end = nbor_k+fast_mul(numk,n_stride);
|
if (dev_packed==dev_nbor) {
|
||||||
|
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == j) continue;
|
if (k == j) continue;
|
||||||
@ -353,7 +358,8 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
|
|||||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||||
int idx = nbor_j - n_stride;
|
int idx = nbor_j;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// i, nbor_j, offset_j, idx);
|
// i, nbor_j, offset_j, idx);
|
||||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||||
@ -426,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int nbor, nbor_end;
|
int nbor, nbor_end, i, numj;
|
||||||
int i, numj;
|
const int* nbor_mem=dev_packed;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||||
n_stride,nbor_end,nbor);
|
n_stride,nbor_end,nbor);
|
||||||
|
|
||||||
@ -436,13 +442,16 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor += n_stride;
|
numj = dev_short_nbor[nbor];
|
||||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
nbor += n_stride;
|
||||||
|
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor];
|
int j=nbor_mem[nbor];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -536,7 +545,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end;
|
int i, numj, nbor_j, nbor_end;
|
||||||
|
const int* nbor_mem=dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -547,14 +556,17 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
int nborj_start = nbor_j;
|
int nborj_start = nbor_j;
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -578,7 +590,8 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||||
int idx = nbor_j - n_stride;
|
int idx = nbor_j;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// i, nbor_j, offset_j, idx);
|
// i, nbor_j, offset_j, idx);
|
||||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||||
@ -602,11 +615,14 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int nbor_k = nborj_start-offset_j+offset_k;
|
int nbor_k = nborj_start-offset_j+offset_k;
|
||||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
int k_end = nbor_end;
|
||||||
int k_end = nbor_k+fast_mul(numk,n_stride);
|
if (dev_packed==dev_nbor) {
|
||||||
|
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (j == k) continue;
|
if (j == k) continue;
|
||||||
@ -717,7 +733,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem=dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -730,13 +746,16 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -776,16 +795,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
int nbork_start = nbor_k;
|
int nbork_start = nbor_k;
|
||||||
|
|
||||||
// look up for zeta_ji: find i in the j's neighbor list
|
// look up for zeta_ji: find i in the j's neighbor list
|
||||||
int m = tid / t_per_atom;
|
int m = tid / t_per_atom;
|
||||||
int ijnum = -1;
|
int ijnum = -1;
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
if (k == i) {
|
if (k == i) {
|
||||||
ijnum = nbor_k;
|
ijnum = nbor_k;
|
||||||
@ -808,7 +829,8 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||||
int idx = ijnum - n_stride;
|
int idx = ijnum;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, ijnum, offset_kf, idx);
|
// j, ijnum, offset_kf, idx);
|
||||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||||
@ -833,7 +855,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
// attractive forces
|
// attractive forces
|
||||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
@ -875,7 +897,8 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
|
|||||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||||
int idx = nbor_k - n_stride;
|
int idx = nbor_k;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, nbor_k, offset_k, idx);
|
// j, nbor_k, offset_k, idx);
|
||||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||||
@ -957,7 +980,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -970,13 +993,16 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -1016,16 +1042,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
int nbork_start = nbor_k;
|
int nbork_start = nbor_k;
|
||||||
|
|
||||||
// look up for zeta_ji
|
// look up for zeta_ji
|
||||||
int m = tid / t_per_atom;
|
int m = tid / t_per_atom;
|
||||||
int ijnum = -1;
|
int ijnum = -1;
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
if (k == i) {
|
if (k == i) {
|
||||||
ijnum = nbor_k;
|
ijnum = nbor_k;
|
||||||
@ -1048,7 +1076,8 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||||
int idx = ijnum - n_stride;
|
int idx = ijnum;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, ijnum, offset_kf, idx);
|
// j, ijnum, offset_kf, idx);
|
||||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||||
@ -1073,7 +1102,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
// attractive forces
|
// attractive forces
|
||||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
@ -1122,7 +1151,8 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||||
int idx = nbor_k - n_stride;
|
int idx = nbor_k;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, nbor_k, offset_k, idx);
|
// j, nbor_k, offset_k, idx);
|
||||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||||
@ -1152,7 +1182,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
|
virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
|
||||||
virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
|
virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
|
||||||
virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
|
virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
|
||||||
|
|
||||||
}
|
}
|
||||||
} // for nbor
|
} // for nbor
|
||||||
|
|
||||||
|
|||||||
@ -271,9 +271,8 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int nbor_j, nbor_end;
|
int nbor_j, nbor_end, i, numj;
|
||||||
int i, numj;
|
const int* nbor_mem=dev_packed;
|
||||||
|
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -284,14 +283,17 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
int nborj_start = nbor_j;
|
int nborj_start = nbor_j;
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -312,11 +314,14 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
|||||||
z = (acctyp)0;
|
z = (acctyp)0;
|
||||||
|
|
||||||
int nbor_k = nborj_start-offset_j+offset_k;
|
int nbor_k = nborj_start-offset_j+offset_k;
|
||||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
int k_end = nbor_end;
|
||||||
int k_end = nbor_k+fast_mul(numk,n_stride);
|
if (dev_packed==dev_nbor) {
|
||||||
|
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == j) continue;
|
if (k == j) continue;
|
||||||
@ -356,7 +361,8 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
|
|||||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||||
int idx = nbor_j - n_stride;
|
int idx = nbor_j;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// i, nbor_j, offset_j, idx);
|
// i, nbor_j, offset_j, idx);
|
||||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||||
@ -427,8 +433,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int nbor, nbor_end;
|
int nbor, nbor_end, i, numj;
|
||||||
int i, numj;
|
const int* nbor_mem=dev_packed;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||||
n_stride,nbor_end,nbor);
|
n_stride,nbor_end,nbor);
|
||||||
|
|
||||||
@ -437,13 +443,16 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor += n_stride;
|
numj = dev_short_nbor[nbor];
|
||||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
nbor += n_stride;
|
||||||
|
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor];
|
int j=nbor_mem[nbor];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -540,7 +549,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end;
|
int i, numj, nbor_j, nbor_end;
|
||||||
|
const int* nbor_mem=dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -551,14 +560,17 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
int nborj_start = nbor_j;
|
int nborj_start = nbor_j;
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -582,7 +594,8 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||||
int idx = nbor_j - n_stride;
|
int idx = nbor_j;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// i, nbor_j, offset_j, idx);
|
// i, nbor_j, offset_j, idx);
|
||||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||||
@ -606,11 +619,14 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int nbor_k = nborj_start-offset_j+offset_k;
|
int nbor_k = nborj_start-offset_j+offset_k;
|
||||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
int k_end = nbor_end;
|
||||||
int k_end = nbor_k+fast_mul(numk,n_stride);
|
if (dev_packed==dev_nbor) {
|
||||||
|
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (j == k) continue;
|
if (j == k) continue;
|
||||||
@ -727,7 +743,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem=dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -740,13 +756,16 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -786,16 +805,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
int nbork_start = nbor_k;
|
int nbork_start = nbor_k;
|
||||||
|
|
||||||
// look up for zeta_ji: find i in the j's neighbor list
|
// look up for zeta_ji: find i in the j's neighbor list
|
||||||
int m = tid / t_per_atom;
|
int m = tid / t_per_atom;
|
||||||
int ijnum = -1;
|
int ijnum = -1;
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
if (k == i) {
|
if (k == i) {
|
||||||
ijnum = nbor_k;
|
ijnum = nbor_k;
|
||||||
@ -818,7 +839,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||||
int idx = ijnum - n_stride;
|
int idx = ijnum;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, ijnum, offset_kf, idx);
|
// j, ijnum, offset_kf, idx);
|
||||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||||
@ -843,7 +865,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
// attractive forces
|
// attractive forces
|
||||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
@ -888,7 +910,8 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
|
|||||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||||
int idx = nbor_k - n_stride;
|
int idx = nbor_k;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, nbor_k, offset_k, idx);
|
// j, nbor_k, offset_k, idx);
|
||||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||||
@ -976,7 +999,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -989,13 +1012,16 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -1035,16 +1061,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
int nbork_start = nbor_k;
|
int nbork_start = nbor_k;
|
||||||
|
|
||||||
// look up for zeta_ji
|
// look up for zeta_ji
|
||||||
int m = tid / t_per_atom;
|
int m = tid / t_per_atom;
|
||||||
int ijnum = -1;
|
int ijnum = -1;
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
if (k == i) {
|
if (k == i) {
|
||||||
ijnum = nbor_k;
|
ijnum = nbor_k;
|
||||||
@ -1067,7 +1095,8 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||||
int idx = ijnum - n_stride;
|
int idx = ijnum;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, ijnum, offset_kf, idx);
|
// j, ijnum, offset_kf, idx);
|
||||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||||
@ -1092,7 +1121,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
// attractive forces
|
// attractive forces
|
||||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
@ -1144,7 +1173,8 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||||
int idx = nbor_k - n_stride;
|
int idx = nbor_k;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, nbor_k, offset_k, idx);
|
// j, nbor_k, offset_k, idx);
|
||||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||||
|
|||||||
@ -277,9 +277,8 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int nbor_j, nbor_end;
|
int nbor_j, nbor_end, i, numj;
|
||||||
int i, numj;
|
const int* nbor_mem=dev_packed;
|
||||||
|
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -290,14 +289,17 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
int nborj_start = nbor_j;
|
int nborj_start = nbor_j;
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -318,11 +320,14 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
|||||||
z = (acctyp)0;
|
z = (acctyp)0;
|
||||||
|
|
||||||
int nbor_k = nborj_start-offset_j+offset_k;
|
int nbor_k = nborj_start-offset_j+offset_k;
|
||||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
int k_end = nbor_end;
|
||||||
int k_end = nbor_k+fast_mul(numk,n_stride);
|
if (dev_packed==dev_nbor) {
|
||||||
|
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == j) continue;
|
if (k == j) continue;
|
||||||
@ -359,7 +364,8 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
|
|||||||
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
|
||||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||||
int idx = nbor_j - n_stride;
|
int idx = nbor_j;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// i, nbor_j, offset_j, idx);
|
// i, nbor_j, offset_j, idx);
|
||||||
acc_zeta(z, tid, t_per_atom, offset_k);
|
acc_zeta(z, tid, t_per_atom, offset_k);
|
||||||
@ -440,8 +446,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int nbor, nbor_end;
|
int nbor, nbor_end, i, numj;
|
||||||
int i, numj;
|
const int* nbor_mem=dev_packed;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||||
n_stride,nbor_end,nbor);
|
n_stride,nbor_end,nbor);
|
||||||
|
|
||||||
@ -450,13 +456,16 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor += n_stride;
|
numj = dev_short_nbor[nbor];
|
||||||
nbor_end = nbor+fast_mul(numj,n_stride);
|
nbor += n_stride;
|
||||||
|
nbor_end = nbor+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor];
|
int j=nbor_mem[nbor];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -556,7 +565,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end;
|
int i, numj, nbor_j, nbor_end;
|
||||||
|
const int* nbor_mem=dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -567,14 +576,17 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
int nborj_start = nbor_j;
|
int nborj_start = nbor_j;
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -598,7 +610,8 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||||||
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
|
||||||
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
//int idx = jj*n_stride + i*t_per_atom + offset_j;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
|
||||||
int idx = nbor_j - n_stride;
|
int idx = nbor_j;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// i, nbor_j, offset_j, idx);
|
// i, nbor_j, offset_j, idx);
|
||||||
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
|
||||||
@ -622,11 +635,14 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int nbor_k = nborj_start-offset_j+offset_k;
|
int nbor_k = nborj_start-offset_j+offset_k;
|
||||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
int k_end = nbor_end;
|
||||||
int k_end = nbor_k+fast_mul(numk,n_stride);
|
if (dev_packed==dev_nbor) {
|
||||||
|
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (j == k) continue;
|
if (j == k) continue;
|
||||||
@ -737,7 +753,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem=dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -750,13 +766,16 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -796,16 +815,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
int nbork_start = nbor_k;
|
int nbork_start = nbor_k;
|
||||||
|
|
||||||
// look up for zeta_ji: find i in the j's neighbor list
|
// look up for zeta_ji: find i in the j's neighbor list
|
||||||
int m = tid / t_per_atom;
|
int m = tid / t_per_atom;
|
||||||
int ijnum = -1;
|
int ijnum = -1;
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
if (k == i) {
|
if (k == i) {
|
||||||
ijnum = nbor_k;
|
ijnum = nbor_k;
|
||||||
@ -828,7 +849,8 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||||
int idx = ijnum - n_stride;
|
int idx = ijnum;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, ijnum, offset_kf, idx);
|
// j, ijnum, offset_kf, idx);
|
||||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||||
@ -853,7 +875,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
// attractive forces
|
// attractive forces
|
||||||
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
@ -895,7 +917,8 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
|
|||||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||||
int idx = nbor_k - n_stride;
|
int idx = nbor_k;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, nbor_k, offset_k, idx);
|
// j, nbor_k, offset_k, idx);
|
||||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||||
@ -977,7 +1000,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -990,13 +1013,16 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
|
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -1036,16 +1062,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
int nbork_start = nbor_k;
|
int nbork_start = nbor_k;
|
||||||
|
|
||||||
// look up for zeta_ji
|
// look up for zeta_ji
|
||||||
int m = tid / t_per_atom;
|
int m = tid / t_per_atom;
|
||||||
int ijnum = -1;
|
int ijnum = -1;
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
if (k == i) {
|
if (k == i) {
|
||||||
ijnum = nbor_k;
|
ijnum = nbor_k;
|
||||||
@ -1068,7 +1096,8 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
|
||||||
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
|
||||||
int idx = ijnum - n_stride;
|
int idx = ijnum;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, ijnum, offset_kf, idx);
|
// j, ijnum, offset_kf, idx);
|
||||||
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
|
||||||
@ -1093,7 +1122,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
// attractive forces
|
// attractive forces
|
||||||
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
@ -1142,7 +1171,8 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
|
||||||
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
//int idx = kk*n_stride + j*t_per_atom + offset_k;
|
||||||
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
|
||||||
int idx = nbor_k - n_stride;
|
int idx = nbor_k;
|
||||||
|
if (dev_packed==dev_nbor) idx -= n_stride;
|
||||||
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
|
||||||
// j, nbor_k, offset_k, idx);
|
// j, nbor_k, offset_k, idx);
|
||||||
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
|
||||||
|
|||||||
@ -224,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int nbor, nbor_end;
|
int nbor, nbor_end, i, numj;
|
||||||
int i, numj;
|
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||||
n_stride,nbor_end,nbor);
|
n_stride,nbor_end,nbor);
|
||||||
|
|
||||||
@ -234,6 +233,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||||
|
|
||||||
int j=dev_packed[nbor];
|
int j=dev_packed[nbor];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
@ -287,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
|
|||||||
f.x+=delx*force;
|
f.x+=delx*force;
|
||||||
f.y+=dely*force;
|
f.y+=dely*force;
|
||||||
f.z+=delz*force;
|
f.z+=delz*force;
|
||||||
|
|
||||||
if (eflag>0)
|
if (eflag>0)
|
||||||
energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
|
energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
|
||||||
|
|
||||||
@ -435,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end;
|
int i, numj, nbor_j, nbor_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -446,13 +447,17 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
int nborj_start = nbor_j;
|
int nborj_start = nbor_j;
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
int j=dev_short_nbor[nbor_j];
|
|
||||||
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -475,15 +480,22 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
|
|||||||
param_gamma_ij=param4_ijparam.y;
|
param_gamma_ij=param4_ijparam.y;
|
||||||
param_r0_ij=param4_ijparam.w;
|
param_r0_ij=param4_ijparam.w;
|
||||||
|
|
||||||
int nbor_k=nborj_start-offset_j+offset_k;
|
int nbor_k,k_end;
|
||||||
int numk = dev_short_nbor[nbor_k-n_stride];
|
if (dev_packed==dev_nbor) {
|
||||||
int k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k=nborj_start-offset_j+offset_k;
|
||||||
|
int numk = dev_short_nbor[nbor_k-n_stride];
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
} else {
|
||||||
|
nbor_k = nbor_j-offset_j+offset_k;
|
||||||
|
if (nbor_k<=nbor_j) nbor_k += n_stride;
|
||||||
|
k_end = nbor_end;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k <= j) continue;
|
if (dev_packed==dev_nbor && k <= j) continue;
|
||||||
|
|
||||||
numtyp4 kx; fetch4(kx,k,pos_tex);
|
numtyp4 kx; fetch4(kx,k,pos_tex);
|
||||||
int ktype=kx.w;
|
int ktype=kx.w;
|
||||||
@ -570,7 +582,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -581,12 +593,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -626,12 +641,14 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
@ -721,7 +738,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
|
|
||||||
if (ii<inum) {
|
if (ii<inum) {
|
||||||
int i, numj, nbor_j, nbor_end, k_end;
|
int i, numj, nbor_j, nbor_end, k_end;
|
||||||
|
const int* nbor_mem = dev_packed;
|
||||||
int offset_j=offset/t_per_atom;
|
int offset_j=offset/t_per_atom;
|
||||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
|
||||||
n_stride,nbor_end,nbor_j);
|
n_stride,nbor_end,nbor_j);
|
||||||
@ -732,12 +749,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
itype=map[itype];
|
itype=map[itype];
|
||||||
|
|
||||||
// recalculate numj and nbor_end for use of the short nbor list
|
// recalculate numj and nbor_end for use of the short nbor list
|
||||||
numj = dev_short_nbor[nbor_j];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_j += n_stride;
|
numj = dev_short_nbor[nbor_j];
|
||||||
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
nbor_j += n_stride;
|
||||||
|
nbor_end = nbor_j+fast_mul(numj,n_stride);
|
||||||
|
nbor_mem = dev_short_nbor;
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
|
||||||
int j=dev_short_nbor[nbor_j];
|
int j=nbor_mem[nbor_j];
|
||||||
j &= NEIGHMASK;
|
j &= NEIGHMASK;
|
||||||
|
|
||||||
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
|
||||||
@ -777,12 +797,14 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// recalculate numk and k_end for the use of short neighbor list
|
// recalculate numk and k_end for the use of short neighbor list
|
||||||
numk = dev_short_nbor[nbor_k];
|
if (dev_packed==dev_nbor) {
|
||||||
nbor_k += n_stride;
|
numk = dev_short_nbor[nbor_k];
|
||||||
k_end = nbor_k+fast_mul(numk,n_stride);
|
nbor_k += n_stride;
|
||||||
|
k_end = nbor_k+fast_mul(numk,n_stride);
|
||||||
|
}
|
||||||
|
|
||||||
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
|
||||||
int k=dev_short_nbor[nbor_k];
|
int k=nbor_mem[nbor_k];
|
||||||
k &= NEIGHMASK;
|
k &= NEIGHMASK;
|
||||||
|
|
||||||
if (k == i) continue;
|
if (k == i) continue;
|
||||||
|
|||||||
Reference in New Issue
Block a user