// ************************************************************************** // aux_fun1.h // ------------------- // W. Michael Brown (ORNL) // // Device code for pair style auxiliary functions // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : Sat Oct 22 2011 // email : brownw@ornl.gov // ***************************************************************************/ #if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_preprocessor.h" #endif #define atom_info(t_per_atom, ii, tid, offset) \ tid=THREAD_ID_X; \ offset=tid & (t_per_atom-1); \ ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; #define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \ i, numj, n_stride, nbor_end, nbor_begin) \ i=dev_nbor[ii]; \ nbor_begin=ii+nbor_pitch; \ numj=dev_nbor[nbor_begin]; \ if (dev_nbor==dev_packed) { \ nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \ n_stride=fast_mul(t_per_atom,nbor_pitch); \ nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \ nbor_begin+=offset; \ } else { \ nbor_begin+=nbor_pitch; \ nbor_begin=dev_nbor[nbor_begin]; \ nbor_end=nbor_begin+numj; \ n_stride=t_per_atom; \ nbor_begin+=offset; \ } #define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ i, numj, stride, nbor_end, nbor_begin) \ i=nbor_mem[ii]; \ nbor_begin=ii+nbor_stride; \ numj=nbor_mem[nbor_begin]; \ nbor_begin+=nbor_stride+ii*(t_per_atom-1); \ stride=fast_mul(t_per_atom,nbor_stride); \ nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & \ (t_per_atom-1)); \ nbor_begin+=offset; #if (SHUFFLE_AVAIL == 0) #define simd_reduce_add1(width, local, offset, tid, one) \ local[0][tid]=one; \ for (unsigned int s=width/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) local[0][tid] += local[0][tid+s]; \ } \ if (offset==0) one=local[0][tid]; #define simd_reduce_add2(width, local, offset, tid, one, two) \ local[0][tid]=one; \ local[1][tid]=two; \ for (unsigned int s=width/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) { \ local[0][tid] += local[0][tid+s]; \ local[1][tid] += local[1][tid+s]; \ } \ } \ if (offset==0) { \ one=local[0][tid]; \ two=local[1][tid]; \ } #define simd_reduce_add3(width, local, offset, tid, one, two, three) \ local[0][tid]=one; \ local[1][tid]=two; \ local[2][tid]=three; \ for (unsigned int s=width/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) { \ local[0][tid] += local[0][tid+s]; \ local[1][tid] += local[1][tid+s]; \ local[2][tid] += local[2][tid+s]; \ } \ } \ if (offset==0) { \ one=local[0][tid]; \ two=local[1][tid]; \ three=local[2][tid]; \ } #define simd_reduce_add6(width, local, offset, tid, one, two, three, \ four, five, six) \ local[0][tid]=one; \ local[1][tid]=two; \ local[2][tid]=three; \ local[3][tid]=four; \ local[4][tid]=five; \ local[5][tid]=six; \ for (unsigned int s=width/2; s>0; s>>=1) { \ simdsync(); \ if (offset < s) { \ local[0][tid] += local[0][tid+s]; \ local[1][tid] += local[1][tid+s]; \ local[2][tid] += local[2][tid+s]; \ local[3][tid] += local[3][tid+s]; \ local[4][tid] += local[4][tid+s]; \ local[5][tid] += local[5][tid+s]; \ } \ } \ if (offset==0) { \ one=local[0][tid]; \ two=local[1][tid]; \ three=local[2][tid]; \ four=local[3][tid]; \ five=local[4][tid]; \ six=local[5][tid]; \ } #define simd_reduce_arr(trip, width, local, offset, tid, arr) \ for (int r=0; r0; s>>=1) { \ simdsync(); \ if (offset < s) { \ for (int r=0; rwidth/2; s>>=1) { \ __syncthreads(); \ if (tid < s) local[0][tid] += local[0][tid+s]; \ } \ if (tid0; s>>=1) { \ simdsync(); \ if (tid < s) local[0][tid] += local[0][tid+s]; \ } \ if (tid==0) one=local[0][tid]; \ } #define block_reduce_add2(width, local, tid, one, two) \ local[0][tid]=one; \ local[1][tid]=two; \ for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \ __syncthreads(); \ if (tid < s) { \ local[0][tid] += local[0][tid+s]; \ local[1][tid] += local[1][tid+s]; \ } \ } \ if (tid0; s>>=1) { \ simdsync(); \ if (tid < s) { \ local[0][tid] += local[0][tid+s]; \ local[1][tid] += local[1][tid+s]; \ } \ } \ if (tid==0) { \ one=local[0][tid]; \ two=local[1][tid]; \ } \ } #define block_reduce_arr(trip, width, local, tid, arr) \ for (int r=0; rwidth/2; s>>=1) { \ __syncthreads(); \ if (tid < s) { \ for (int r=0; r0; s>>=1) { \ simdsync(); \ if (tid < s) { \ for (int r=0; r1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ if (EVFLAG && (vflag==2 || eflag==2)) { \ if (eflag) { \ simdsync(); \ simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ } \ if (vflag) { \ simdsync(); \ simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ if (EVFLAG && (vflag==2 || eflag==2)) { \ if (eflag) { \ simdsync(); \ simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ if (vflag) { \ simdsync(); \ simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ if (offset==0 && ii0; s>>=1) one += shfl_down(one, s, width); #define simd_reduce_add2(width, one, two) \ for (unsigned int s=width/2; s>0; s>>=1) { \ one += shfl_down(one, s, width); \ two += shfl_down(two, s, width); \ } #define simd_reduce_add3(width, one, two, three) \ for (unsigned int s=width/2; s>0; s>>=1) { \ one += shfl_down(one, s, width); \ two += shfl_down(two, s, width); \ three += shfl_down(three, s, width); \ } #define simd_reduce_add6(width, one, two, three, four, five, six) \ for (unsigned int s=width/2; s>0; s>>=1) { \ one += shfl_down(one, s, width); \ two += shfl_down(two, s, width); \ three += shfl_down(three, s, width); \ four += shfl_down(four, s, width); \ five += shfl_down(five, s, width); \ six += shfl_down(six, s, width); \ } #define simd_reduce_arr(trip, width, arr) \ for (unsigned int s=width/2; s>0; s>>=1) { \ for (int r=0; r1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (vflag==2 || eflag==2) { \ if (eflag) \ simd_reduce_add1(t_per_atom,energy); \ if (vflag) \ simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ if (offset==0 && ii 1; active_subgs /= vwidth) { \ if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ if (bnum < active_subgs) { \ if (eflag) { \ simd_reduce_add1(vwidth, energy); \ if (voffset==0) red_acc[6][bnum] = energy; \ } \ if (vflag) { \ simd_reduce_arr(6, vwidth, virial); \ if (voffset==0) \ for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ } \ } \ \ __syncthreads(); \ if (tid < active_subgs) { \ if (eflag) energy = red_acc[6][tid]; \ if (vflag) \ for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ } else { \ if (eflag) energy = (acctyp)0; \ if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ } \ } \ \ if (bnum == 0) { \ int ei=BLOCK_ID_X; \ const int ev_stride=NUM_BLOCKS_X; \ if (eflag) { \ simd_reduce_add1(vwidth, energy); \ if (tid==0) { \ engv[ei]=energy*(acctyp)0.5; \ ei+=ev_stride; \ } \ } \ if (vflag) { \ simd_reduce_arr(6, vwidth, virial); \ if (tid==0) { \ for (int r=0; r<6; r++) { \ engv[ei]=virial[r]*(acctyp)0.5; \ ei+=ev_stride; \ } \ } \ } \ } \ } else if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (vflag==2 || eflag==2) { \ if (eflag) \ simd_reduce_add2(t_per_atom,energy,e_coul); \ if (vflag) \ simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ if (offset==0 && ii 1; active_subgs /= vwidth) { \ if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ if (bnum < active_subgs) { \ if (eflag) { \ simd_reduce_add2(vwidth, energy, e_coul); \ if (voffset==0) { \ red_acc[6][bnum] = energy; \ red_acc[7][bnum] = e_coul; \ } \ } \ if (vflag) { \ simd_reduce_arr(6, vwidth, virial); \ if (voffset==0) \ for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ } \ } \ \ __syncthreads(); \ if (tid < active_subgs) { \ if (eflag) { \ energy = red_acc[6][tid]; \ e_coul = red_acc[7][tid]; \ } \ if (vflag) \ for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ } else { \ if (eflag) energy = e_coul = (acctyp)0; \ if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ } \ } \ \ if (bnum == 0) { \ int ei=BLOCK_ID_X; \ const int ev_stride=NUM_BLOCKS_X; \ if (eflag) { \ simd_reduce_add2(vwidth, energy, e_coul); \ if (tid==0) { \ engv[ei]=energy*(acctyp)0.5; \ ei+=ev_stride; \ engv[ei]=e_coul*(acctyp)0.5; \ ei+=ev_stride; \ } \ } \ if (vflag) { \ simd_reduce_arr(6, vwidth, virial); \ if (tid==0) { \ for (int r=0; r<6; r++) { \ engv[ei]=virial[r]*(acctyp)0.5; \ ei+=ev_stride; \ } \ } \ } \ } \ } else if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii