From f803ba56558aec87be6df267f2870b7637aaa5cb Mon Sep 17 00:00:00 2001 From: Vsevak Date: Tue, 12 Nov 2019 21:35:36 +0300 Subject: [PATCH] Add shfl_xor sum to kernel for ARCH>=300 --- lib/gpu/lal_lj_tip4p_long.cu | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu index 7c6cec4473..1ea6de1d41 100644 --- a/lib/gpu/lal_lj_tip4p_long.cu +++ b/lib/gpu/lal_lj_tip4p_long.cu @@ -472,6 +472,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, } // if cut_coulsqplus } // for nbor if (t_per_atom>1) { +#if (ARCH < 300) __local acctyp red_acc[6][BLOCK_PAIR]; red_acc[0][tid]=fO.x; red_acc[1][tid]=fO.y; @@ -497,6 +498,20 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, } for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; } +#else + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + fO.x += shfl_xor(fO.x, s, t_per_atom); + fO.y += shfl_xor(fO.y, s, t_per_atom); + fO.z += shfl_xor(fO.z, s, t_per_atom); + fO.w += shfl_xor(fO.w, s, t_per_atom); + } + if (vflag>0) { + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + for (int r=0; r<6; r++) + vO[r] += shfl_xor(vO[r], s, t_per_atom); + } + } +#endif } if(offset == 0) { ansO[i] = fO;