Add shfl_xor-based sum reduction to the k_lj_tip4p_long kernel for ARCH >= 300
This commit is contained in:
@@ -472,6 +472,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
|||||||
} // if cut_coulsqplus
|
} // if cut_coulsqplus
|
||||||
} // for nbor
|
} // for nbor
|
||||||
if (t_per_atom>1) {
|
if (t_per_atom>1) {
|
||||||
|
#if (ARCH < 300)
|
||||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||||
red_acc[0][tid]=fO.x;
|
red_acc[0][tid]=fO.x;
|
||||||
red_acc[1][tid]=fO.y;
|
red_acc[1][tid]=fO.y;
|
||||||
@@ -497,6 +498,20 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
|||||||
}
|
}
|
||||||
for (int r=0; r<6; r++) vO[r]=red_acc[r][tid];
|
for (int r=0; r<6; r++) vO[r]=red_acc[r][tid];
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||||
|
fO.x += shfl_xor(fO.x, s, t_per_atom);
|
||||||
|
fO.y += shfl_xor(fO.y, s, t_per_atom);
|
||||||
|
fO.z += shfl_xor(fO.z, s, t_per_atom);
|
||||||
|
fO.w += shfl_xor(fO.w, s, t_per_atom);
|
||||||
|
}
|
||||||
|
if (vflag>0) {
|
||||||
|
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
|
||||||
|
for (int r=0; r<6; r++)
|
||||||
|
vO[r] += shfl_xor(vO[r], s, t_per_atom);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
if(offset == 0) {
|
if(offset == 0) {
|
||||||
ansO[i] = fO;
|
ansO[i] = fO;
|
||||||
|
|||||||
Reference in New Issue
Block a user