// Source: lammps/lib/gpu/lal_aux_fun1.h (2021-02-15 08:20:50 -08:00, 561 lines, 38 KiB, C)

// **************************************************************************
// aux_fun1.h
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for pair style auxiliary functions
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin : Sat Oct 22 2011
// email : brownw@ornl.gov
// ***************************************************************************/
#if defined(NV_KERNEL) || defined(USE_HIP)
#include "lal_preprocessor.h"
#endif
// Derive the per-thread indices used by the pair kernels from the thread and
// block ids.
//   t_per_atom : (in)  divisor/mask used to split the thread index
//   ii         : (out) BLOCK_ID_X*(BLOCK_SIZE_X/t_per_atom) + tid/t_per_atom
//   tid        : (out) THREAD_ID_X
//   offset     : (out) tid & (t_per_atom-1)
// NOTE(review): the mask equals tid % t_per_atom only when t_per_atom is a
// power of two - confirm that callers guarantee this.
// Every argument is parenthesized so that expression arguments keep their
// intended precedence. The macro expands to several statements; do not use
// it as the unbraced body of an if/for.
#define atom_info(t_per_atom, ii, tid, offset) \
  (tid)=THREAD_ID_X; \
  (offset)=(tid) & ((t_per_atom)-1); \
  (ii)=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/(t_per_atom))+ \
       (tid)/(t_per_atom);
// Look up the neighbor-list bounds for entry ii of dev_nbor.
//   dev_nbor, dev_packed : neighbor arrays; the layout used is selected by
//                          whether the two pointers compare equal
//   nbor_pitch           : distance between the rows of dev_nbor
//   t_per_atom, offset   : divisor/mask and this thread's starting offset
//   i          : (out) dev_nbor[ii]
//   numj       : (out) dev_nbor[ii+nbor_pitch]
//   n_stride   : (out) step between successive entries read by one thread
//   nbor_begin : (out) first index for this thread (offset already added)
//   nbor_end   : (out) end index for the entry
// When dev_nbor==dev_packed the indices are computed directly from ii,
// nbor_pitch and t_per_atom; otherwise the start index is read from
// dev_nbor[ii+2*nbor_pitch] and entries are contiguous with step t_per_atom.
// All arguments are parenthesized so expression arguments (e.g. a computed
// t_per_atom) keep their intended precedence. Expands to several statements.
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
                  i, numj, n_stride, nbor_end, nbor_begin) \
  (i)=(dev_nbor)[(ii)]; \
  (nbor_begin)=(ii)+(nbor_pitch); \
  (numj)=(dev_nbor)[(nbor_begin)]; \
  if ((dev_nbor)==(dev_packed)) { \
    (nbor_begin)+=(nbor_pitch)+fast_mul((ii),(t_per_atom)-1); \
    (n_stride)=fast_mul((t_per_atom),(nbor_pitch)); \
    (nbor_end)=(nbor_begin)+fast_mul((numj)/(t_per_atom),(n_stride))+ \
               ((numj) & ((t_per_atom)-1)); \
    (nbor_begin)+=(offset); \
  } else { \
    (nbor_begin)+=(nbor_pitch); \
    (nbor_begin)=(dev_nbor)[(nbor_begin)]; \
    (nbor_end)=(nbor_begin)+(numj); \
    (n_stride)=(t_per_atom); \
    (nbor_begin)+=(offset); \
  }
// Look up the neighbor-list bounds for entry ii of nbor_mem, always using
// the layout in which indices are computed from ii, nbor_stride and
// t_per_atom (no start index is read from the array).
//   nbor_mem, nbor_stride : neighbor array and the distance between its rows
//   t_per_atom, offset    : divisor/mask and this thread's starting offset
//   i          : (out) nbor_mem[ii]
//   numj       : (out) nbor_mem[ii+nbor_stride]
//   stride     : (out) t_per_atom*nbor_stride
//   nbor_begin : (out) first index for this thread (offset already added)
//   nbor_end   : (out) end index for the entry
// All arguments are parenthesized so expression arguments keep their
// intended precedence. Expands to several statements.
#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
                    i, numj, stride, nbor_end, nbor_begin) \
  (i)=(nbor_mem)[(ii)]; \
  (nbor_begin)=(ii)+(nbor_stride); \
  (numj)=(nbor_mem)[(nbor_begin)]; \
  (nbor_begin)+=(nbor_stride)+(ii)*((t_per_atom)-1); \
  (stride)=fast_mul((t_per_atom),(nbor_stride)); \
  (nbor_end)=(nbor_begin)+fast_mul((numj)/(t_per_atom),(stride))+ \
             ((numj) & ((t_per_atom)-1)); \
  (nbor_begin)+=(offset);
#if (SHUFFLE_AVAIL == 0)
// Sum one value over a group of `width` threads using the scratch array
// `local` (variant used when SHUFFLE_AVAIL == 0).
//   width  : number of cooperating threads; halved each step
//   local  : 2-D scratch array; row 0 is used, indexed by tid
//   offset : position of this thread inside its group
//   tid    : index of this thread into local
//   one    : (in/out) value to sum; only the thread with offset==0 receives
//            the result, other threads keep their input value
// simdsync() is issued before every step. `width` and the other arguments
// are parenthesized so expression arguments keep their intended precedence.
#define simd_reduce_add1(width, local, offset, tid, one) \
  (local)[0][(tid)]=(one); \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) (local)[0][(tid)] += (local)[0][(tid)+s]; \
  } \
  if ((offset)==0) (one)=(local)[0][(tid)];
// Sum two values over a group of `width` threads using rows 0 and 1 of the
// scratch array `local` (variant used when SHUFFLE_AVAIL == 0).
//   width, local, offset, tid : as for simd_reduce_add1
//   one, two : (in/out) values to sum; only the thread with offset==0
//              receives the results
// simdsync() is issued before every step. Arguments are parenthesized so
// expression arguments keep their intended precedence.
#define simd_reduce_add2(width, local, offset, tid, one, two) \
  (local)[0][(tid)]=(one); \
  (local)[1][(tid)]=(two); \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) { \
      (local)[0][(tid)] += (local)[0][(tid)+s]; \
      (local)[1][(tid)] += (local)[1][(tid)+s]; \
    } \
  } \
  if ((offset)==0) { \
    (one)=(local)[0][(tid)]; \
    (two)=(local)[1][(tid)]; \
  }
// Sum three values over a group of `width` threads using rows 0..2 of the
// scratch array `local` (variant used when SHUFFLE_AVAIL == 0).
//   width, local, offset, tid : as for simd_reduce_add1
//   one, two, three : (in/out) values to sum; only the thread with offset==0
//                     receives the results
// simdsync() is issued before every step. Arguments are parenthesized so
// expression arguments keep their intended precedence.
#define simd_reduce_add3(width, local, offset, tid, one, two, three) \
  (local)[0][(tid)]=(one); \
  (local)[1][(tid)]=(two); \
  (local)[2][(tid)]=(three); \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) { \
      (local)[0][(tid)] += (local)[0][(tid)+s]; \
      (local)[1][(tid)] += (local)[1][(tid)+s]; \
      (local)[2][(tid)] += (local)[2][(tid)+s]; \
    } \
  } \
  if ((offset)==0) { \
    (one)=(local)[0][(tid)]; \
    (two)=(local)[1][(tid)]; \
    (three)=(local)[2][(tid)]; \
  }
// Sum six values over a group of `width` threads using rows 0..5 of the
// scratch array `local` (variant used when SHUFFLE_AVAIL == 0).
//   width, local, offset, tid : as for simd_reduce_add1
//   one ... six : (in/out) values to sum; only the thread with offset==0
//                 receives the results
// simdsync() is issued before every step. Arguments are parenthesized so
// expression arguments keep their intended precedence.
#define simd_reduce_add6(width, local, offset, tid, one, two, three, \
                         four, five, six) \
  (local)[0][(tid)]=(one); \
  (local)[1][(tid)]=(two); \
  (local)[2][(tid)]=(three); \
  (local)[3][(tid)]=(four); \
  (local)[4][(tid)]=(five); \
  (local)[5][(tid)]=(six); \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) { \
      (local)[0][(tid)] += (local)[0][(tid)+s]; \
      (local)[1][(tid)] += (local)[1][(tid)+s]; \
      (local)[2][(tid)] += (local)[2][(tid)+s]; \
      (local)[3][(tid)] += (local)[3][(tid)+s]; \
      (local)[4][(tid)] += (local)[4][(tid)+s]; \
      (local)[5][(tid)] += (local)[5][(tid)+s]; \
    } \
  } \
  if ((offset)==0) { \
    (one)=(local)[0][(tid)]; \
    (two)=(local)[1][(tid)]; \
    (three)=(local)[2][(tid)]; \
    (four)=(local)[3][(tid)]; \
    (five)=(local)[4][(tid)]; \
    (six)=(local)[5][(tid)]; \
  }
// Sum the first `trip` elements of `arr` over a group of `width` threads
// using rows 0..trip-1 of the scratch array `local` (variant used when
// SHUFFLE_AVAIL == 0).
//   trip  : number of array elements to reduce
//   width, local, offset, tid : as for simd_reduce_add1
//   arr   : (in/out) per-thread array; only the thread with offset==0
//           receives the summed values
// simdsync() is issued before every step. Arguments are parenthesized so
// expression arguments keep their intended precedence.
#define simd_reduce_arr(trip, width, local, offset, tid, arr) \
  for (int r=0; r<(trip); r++) \
    (local)[r][(tid)]=(arr)[r]; \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) { \
      for (int r=0; r<(trip); r++) \
        (local)[r][(tid)] += (local)[r][(tid)+s]; \
    } \
  } \
  if ((offset)==0) { \
    for (int r=0; r<(trip); r++) \
      (arr)[r]=(local)[r][(tid)]; \
  }
// Sum one value over all BLOCK_SIZE_X threads of a block using row 0 of the
// scratch array `local`.
//   width : group size at which the reduction switches from
//           __syncthreads() steps to simdsync() steps
//   local : 2-D scratch array; row 0 is used, indexed by tid
//   tid   : index of this thread within the block
//   one   : (in/out) value to sum; only the thread with tid==0 receives the
//           block total
// Steps with a span larger than width/2 are separated by __syncthreads();
// the remaining steps run only for tid<width and use simdsync().
// `width` is parenthesized so expression arguments keep their intended
// precedence in both loop bounds.
#define block_reduce_add1(width, local, tid, one) \
  (local)[0][(tid)]=(one); \
  for (unsigned int s=BLOCK_SIZE_X/2; s>(width)/2; s>>=1) { \
    __syncthreads(); \
    if ((tid) < s) (local)[0][(tid)] += (local)[0][(tid)+s]; \
  } \
  if ((tid)<(width)) { \
    for (unsigned int s=(width)/2; s>0; s>>=1) { \
      simdsync(); \
      if ((tid) < s) (local)[0][(tid)] += (local)[0][(tid)+s]; \
    } \
    if ((tid)==0) (one)=(local)[0][(tid)]; \
  }
// Sum two values over all BLOCK_SIZE_X threads of a block using rows 0 and 1
// of the scratch array `local`.
//   width, local, tid : as for block_reduce_add1
//   one, two : (in/out) values to sum; only the thread with tid==0 receives
//              the block totals
// Steps with a span larger than width/2 are separated by __syncthreads();
// the remaining steps run only for tid<width and use simdsync().
// `width` is parenthesized so expression arguments keep their intended
// precedence in both loop bounds.
#define block_reduce_add2(width, local, tid, one, two) \
  (local)[0][(tid)]=(one); \
  (local)[1][(tid)]=(two); \
  for (unsigned int s=BLOCK_SIZE_X/2; s>(width)/2; s>>=1) { \
    __syncthreads(); \
    if ((tid) < s) { \
      (local)[0][(tid)] += (local)[0][(tid)+s]; \
      (local)[1][(tid)] += (local)[1][(tid)+s]; \
    } \
  } \
  if ((tid)<(width)) { \
    for (unsigned int s=(width)/2; s>0; s>>=1) { \
      simdsync(); \
      if ((tid) < s) { \
        (local)[0][(tid)] += (local)[0][(tid)+s]; \
        (local)[1][(tid)] += (local)[1][(tid)+s]; \
      } \
    } \
    if ((tid)==0) { \
      (one)=(local)[0][(tid)]; \
      (two)=(local)[1][(tid)]; \
    } \
  }
// Sum the first `trip` elements of `arr` over all BLOCK_SIZE_X threads of a
// block using rows 0..trip-1 of the scratch array `local`.
//   trip  : number of array elements to reduce
//   width, local, tid : as for block_reduce_add1
//   arr   : (in/out) per-thread array; only the thread with tid==0 receives
//           the block totals
// Steps with a span larger than width/2 are separated by __syncthreads();
// the remaining steps run only for tid<width and use simdsync().
// `width` and `trip` are parenthesized so expression arguments keep their
// intended precedence.
#define block_reduce_arr(trip, width, local, tid, arr) \
  for (int r=0; r<(trip); r++) \
    (local)[r][(tid)]=(arr)[r]; \
  for (unsigned int s=BLOCK_SIZE_X/2; s>(width)/2; s>>=1) { \
    __syncthreads(); \
    if ((tid) < s) { \
      for (int r=0; r<(trip); r++) \
        (local)[r][(tid)] += (local)[r][(tid)+s]; \
    } \
  } \
  if ((tid)<(width)) { \
    for (unsigned int s=(width)/2; s>0; s>>=1) { \
      simdsync(); \
      if ((tid) < s) { \
        for (int r=0; r<(trip); r++) \
          (local)[r][(tid)] += (local)[r][(tid)+s]; \
      } \
    } \
    if ((tid)==0) { \
      for (int r=0; r<(trip); r++) \
        (arr)[r]=(local)[r][(tid)]; \
    } \
  }
// Declarations of the __local scratch array `red_acc` that the
// store_answers macros of this (SHUFFLE_AVAIL == 0) branch reference by
// name. Every variant provides 6 rows; the second dimension is the
// block-size constant named in the declaration. Each macro already ends
// with a semicolon.
#define local_allocate_store_pair()                                         \
    __local acctyp red_acc[6][BLOCK_PAIR];

#define local_allocate_store_charge()                                       \
    __local acctyp red_acc[6][BLOCK_PAIR];

#define local_allocate_store_bio()                                          \
    __local acctyp red_acc[6][BLOCK_BIO_PAIR];

#define local_allocate_store_ellipse()                                      \
    __local acctyp red_acc[6][BLOCK_ELLIPSE];

#define local_allocate_store_three()                                        \
    __local acctyp red_acc[6][BLOCK_ELLIPSE];
// Reduce and store force, energy and virial results (SHUFFLE_AVAIL == 0
// variant). Requires an array named `red_acc` to be in scope; it is passed
// as scratch space to the simd_reduce_* and block_reduce_* macros.
//   f          : force accumulator with .x/.y/.z members
//   energy     : energy accumulator
//   virial     : array of 6 virial accumulators
//   ii, inum   : result index of this thread's group and the number of
//                valid indices (results are stored only when ii<inum)
//   tid, t_per_atom, offset : thread index, group width, offset in group
//   eflag, vflag : select energy / virial output; the value 2 selects the
//                  layout indexed by ii instead of the per-block layout
//   ans        : output array for f, indexed by ii
//   engv       : output array for energy and virial terms
// Steps:
//   1. If t_per_atom>1, f.x/f.y/f.z are summed over the group; when EVFLAG
//      is set and either flag equals 2, energy (if eflag) and the 6 virial
//      terms (if vflag) are summed over the group as well.
//   2. The thread with offset==0 and ii<inum writes f to ans[ii].
//   3. If EVFLAG and (eflag || vflag):
//      - neither flag equals 2: energy and virial are summed over the whole
//        block, and the thread with tid==0 writes each total times 0.5 to
//        engv, starting at index BLOCK_ID_X and advancing by NUM_BLOCKS_X
//        per value (energy first, then the 6 virial terms);
//      - otherwise the thread with offset==0 and ii<inum writes its values
//        times 0.5 to engv, starting at index ii and advancing by inum.
// The simdsync()/__syncthreads() calls separate successive uses of red_acc;
// keep their order when editing. Expands to several statements.
#define store_answers(f, energy, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
if (EVFLAG && (vflag==2 || eflag==2)) { \
if (eflag) { \
simdsync(); \
simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \
} \
if (vflag) { \
simdsync(); \
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
} \
} \
} \
if (offset==0 && ii<inum) ans[ii]=f; \
if (EVFLAG && (eflag || vflag)) { \
int ei=BLOCK_ID_X; \
if (eflag!=2 && vflag!=2) { \
const int ev_stride=NUM_BLOCKS_X; \
if (eflag) { \
simdsync(); \
block_reduce_add1(simd_size(), red_acc, tid, energy); \
if (vflag) __syncthreads(); \
if (tid==0) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
if (vflag) { \
simdsync(); \
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
if (tid==0) { \
for (int r=0; r<6; r++) { \
engv[ei]=virial[r]*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
} \
} else if (offset==0 && ii<inum) { \
int ei=ii; \
if (EVFLAG && eflag) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
} \
if (EVFLAG && vflag) { \
for (int i=0; i<6; i++) { \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
} \
}
// Reduce and store force, two energy terms and virial results
// (SHUFFLE_AVAIL == 0 variant). Same flow as store_answers, with the extra
// accumulator e_coul reduced and stored immediately after energy. Requires
// an array named `red_acc` to be in scope as scratch space.
//   f          : force accumulator with .x/.y/.z members
//   energy, e_coul : the two energy accumulators
//   virial     : array of 6 virial accumulators
//   ii, inum   : result index of this thread's group and the number of
//                valid indices (results are stored only when ii<inum)
//   tid, t_per_atom, offset : thread index, group width, offset in group
//   eflag, vflag : select energy / virial output; the value 2 selects the
//                  layout indexed by ii instead of the per-block layout
//   ans        : output array for f, indexed by ii
//   engv       : output array for energy and virial terms
// Output order in engv is energy, e_coul, then the 6 virial terms, each
// multiplied by 0.5. In the per-block layout (neither flag equals 2) the
// thread with tid==0 writes starting at index BLOCK_ID_X and advancing by
// NUM_BLOCKS_X; otherwise the thread with offset==0 and ii<inum writes
// starting at index ii and advancing by inum.
// The simdsync()/__syncthreads() calls separate successive uses of red_acc;
// keep their order when editing. Expands to several statements.
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
if (EVFLAG && (vflag==2 || eflag==2)) { \
if (eflag) { \
simdsync(); \
simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
} \
if (vflag) { \
simdsync(); \
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
} \
} \
} \
if (offset==0 && ii<inum) ans[ii]=f; \
if (EVFLAG && (eflag || vflag)) { \
int ei=BLOCK_ID_X; \
const int ev_stride=NUM_BLOCKS_X; \
if (eflag!=2 && vflag!=2) { \
if (eflag) { \
simdsync(); \
block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \
if (vflag) __syncthreads(); \
if (tid==0) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=ev_stride; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
if (vflag) { \
simdsync(); \
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
if (tid==0) { \
for (int r=0; r<6; r++) { \
engv[ei]=virial[r]*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
} \
} else if (offset==0 && ii<inum) { \
int ei=ii; \
if (EVFLAG && eflag) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \
} \
if (EVFLAG && vflag) { \
for (int i=0; i<6; i++) { \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
} \
}
#else
// Sum one value over a group of `width` threads using shfl_down (variant
// used when SHUFFLE_AVAIL != 0); no scratch array is needed.
//   width : number of cooperating threads; the shuffle distance starts at
//           width/2 and is halved each step
//   one   : (in/out) value to sum, updated in place on every thread
// NOTE(review): presumably only the first lane of each group ends up with
// the complete sum - confirm against the semantics of shfl_down.
// `width` is parenthesized so expression arguments keep their intended
// precedence in the loop bound.
#define simd_reduce_add1(width, one) \
  for (unsigned int s=(width)/2; s>0; s>>=1) \
    (one) += shfl_down((one), s, (width));
// Sum two values over a group of `width` threads using shfl_down (variant
// used when SHUFFLE_AVAIL != 0).
//   width    : number of cooperating threads; the shuffle distance starts
//              at width/2 and is halved each step
//   one, two : (in/out) values to sum, updated in place
// `width` is parenthesized so expression arguments keep their intended
// precedence in the loop bound.
#define simd_reduce_add2(width, one, two) \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    (one) += shfl_down((one), s, (width)); \
    (two) += shfl_down((two), s, (width)); \
  }
// Sum three values over a group of `width` threads using shfl_down (variant
// used when SHUFFLE_AVAIL != 0).
//   width           : number of cooperating threads; the shuffle distance
//                     starts at width/2 and is halved each step
//   one, two, three : (in/out) values to sum, updated in place
// `width` is parenthesized so expression arguments keep their intended
// precedence in the loop bound.
#define simd_reduce_add3(width, one, two, three) \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    (one) += shfl_down((one), s, (width)); \
    (two) += shfl_down((two), s, (width)); \
    (three) += shfl_down((three), s, (width)); \
  }
// Sum six values over a group of `width` threads using shfl_down (variant
// used when SHUFFLE_AVAIL != 0).
//   width       : number of cooperating threads; the shuffle distance
//                 starts at width/2 and is halved each step
//   one ... six : (in/out) values to sum, updated in place
// `width` is parenthesized so expression arguments keep their intended
// precedence in the loop bound.
#define simd_reduce_add6(width, one, two, three, four, five, six) \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    (one) += shfl_down((one), s, (width)); \
    (two) += shfl_down((two), s, (width)); \
    (three) += shfl_down((three), s, (width)); \
    (four) += shfl_down((four), s, (width)); \
    (five) += shfl_down((five), s, (width)); \
    (six) += shfl_down((six), s, (width)); \
  }
// Sum the first `trip` elements of `arr` over a group of `width` threads
// using shfl_down (variant used when SHUFFLE_AVAIL != 0).
//   trip  : number of array elements to reduce
//   width : number of cooperating threads; the shuffle distance starts at
//           width/2 and is halved each step
//   arr   : (in/out) per-thread array, updated in place
// `width` and `trip` are parenthesized so expression arguments keep their
// intended precedence.
#define simd_reduce_arr(trip, width, arr) \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    for (int r=0; r<(trip); r++) \
      (arr)[r] += shfl_down((arr)[r], s, (width)); \
  }
#if (EVFLAG == 1)
// Declarations of the __local scratch array `red_acc` used by the
// store_answers macros of the shuffle branch when EVFLAG == 1. The second
// dimension is the block-size constant divided by SIMD_SIZE. The 7-row
// variants provide rows 0..6 and the 8-row variants rows 0..7.
// local_allocate_store_ellipse() expands to nothing in this branch.
#define local_allocate_store_pair()                                         \
    __local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];

#define local_allocate_store_charge()                                       \
    __local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];

#define local_allocate_store_bio()                                          \
    __local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];

#define local_allocate_store_ellipse()

#define local_allocate_store_three()                                        \
    __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];
// Reduce and store force, energy and virial results (shuffle variant,
// EVFLAG == 1). Requires an array named `red_acc` to be in scope; rows 0..5
// hold virial partial sums and row 6 holds the energy partial sum.
//   f          : force accumulator with .x/.y/.z members
//   energy     : energy accumulator
//   virial     : array of 6 virial accumulators
//   ii, inum   : result index of this thread's group and the number of
//                valid indices (results are stored only when ii<inum)
//   tid, t_per_atom, offset : thread index, group width, offset in group
//   eflag, vflag : select energy / virial output; the value 2 selects the
//                  layout indexed by ii instead of the per-block layout
//   ans        : output array for f, indexed by ii
//   engv       : output array for energy and virial terms
// Steps:
//   1. If t_per_atom>1, f.x/f.y/f.z are summed over the group; when either
//      flag equals 2, energy (if eflag) and the 6 virial terms (if vflag)
//      are summed over the group as well.
//   2. The thread with offset==0 and ii<inum writes f to ans[ii].
//   3. If eflag || vflag and neither flag equals 2, a block-wide sum is
//      built in stages: every group of simd_size() threads reduces its
//      values, the thread with voffset==0 stores the partial sums in
//      red_acc at column bnum, and after __syncthreads() the threads with
//      tid<active_subgs reload one partial each while all others reset
//      their accumulators to zero. The loop repeats with active_subgs
//      divided by simd_size() until at most one group remains. The group
//      with bnum==0 then performs the final reduction and the thread with
//      tid==0 writes each total times 0.5 to engv, starting at index
//      BLOCK_ID_X and advancing by NUM_BLOCKS_X (energy first, then the 6
//      virial terms).
//   4. Otherwise (a flag equals 2) the thread with offset==0 and ii<inum
//      writes its values times 0.5 to engv, starting at index ii and
//      advancing by inum.
// The extra __syncthreads() at the top of the loop runs on every pass
// except the first. Expands to several statements.
#define store_answers(f, energy, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (vflag==2 || eflag==2) { \
if (eflag) \
simd_reduce_add1(t_per_atom,energy); \
if (vflag) \
simd_reduce_arr(6, t_per_atom,virial); \
} \
} \
if (offset==0 && ii<inum) ans[ii]=f; \
if (eflag || vflag) { \
if (eflag!=2 && vflag!=2) { \
const int vwidth = simd_size(); \
const int voffset = tid & (simd_size() - 1); \
const int bnum = tid/simd_size(); \
int active_subgs = BLOCK_SIZE_X/simd_size(); \
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
if (bnum < active_subgs) { \
if (eflag) { \
simd_reduce_add1(vwidth, energy); \
if (voffset==0) red_acc[6][bnum] = energy; \
} \
if (vflag) { \
simd_reduce_arr(6, vwidth, virial); \
if (voffset==0) \
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
} \
} \
\
__syncthreads(); \
if (tid < active_subgs) { \
if (eflag) energy = red_acc[6][tid]; \
if (vflag) \
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
} else { \
if (eflag) energy = (acctyp)0; \
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
} \
} \
\
if (bnum == 0) { \
int ei=BLOCK_ID_X; \
const int ev_stride=NUM_BLOCKS_X; \
if (eflag) { \
simd_reduce_add1(vwidth, energy); \
if (tid==0) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
if (vflag) { \
simd_reduce_arr(6, vwidth, virial); \
if (tid==0) { \
for (int r=0; r<6; r++) { \
engv[ei]=virial[r]*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
} \
} \
} else if (offset==0 && ii<inum) { \
int ei=ii; \
if (eflag) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag) { \
for (int i=0; i<6; i++) { \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
} \
}
// Reduce and store force, two energy terms and virial results (shuffle
// variant, EVFLAG == 1). Same flow as store_answers of this branch, with
// the extra accumulator e_coul reduced and stored immediately after energy.
// Requires an array named `red_acc` to be in scope; rows 0..5 hold virial
// partial sums, row 6 the energy partial and row 7 the e_coul partial.
//   f          : force accumulator with .x/.y/.z members
//   energy, e_coul : the two energy accumulators
//   virial     : array of 6 virial accumulators
//   ii, inum   : result index of this thread's group and the number of
//                valid indices (results are stored only when ii<inum)
//   tid, t_per_atom, offset : thread index, group width, offset in group
//   eflag, vflag : select energy / virial output; the value 2 selects the
//                  layout indexed by ii instead of the per-block layout
//   ans        : output array for f, indexed by ii
//   engv       : output array for energy and virial terms
// Output order in engv is energy, e_coul, then the 6 virial terms, each
// multiplied by 0.5. In the per-block layout (neither flag equals 2) the
// block-wide sum is built in stages through red_acc, and the thread with
// tid==0 writes starting at index BLOCK_ID_X and advancing by NUM_BLOCKS_X;
// otherwise the thread with offset==0 and ii<inum writes starting at index
// ii and advancing by inum.
// The extra __syncthreads() at the top of the loop runs on every pass
// except the first. Expands to several statements.
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
t_per_atom, offset, eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
if (vflag==2 || eflag==2) { \
if (eflag) \
simd_reduce_add2(t_per_atom,energy,e_coul); \
if (vflag) \
simd_reduce_arr(6, t_per_atom,virial); \
} \
} \
if (offset==0 && ii<inum) ans[ii]=f; \
if (eflag || vflag) { \
if (eflag!=2 && vflag!=2) { \
const int vwidth = simd_size(); \
const int voffset = tid & (simd_size() - 1); \
const int bnum = tid/simd_size(); \
int active_subgs = BLOCK_SIZE_X/simd_size(); \
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
if (bnum < active_subgs) { \
if (eflag) { \
simd_reduce_add2(vwidth, energy, e_coul); \
if (voffset==0) { \
red_acc[6][bnum] = energy; \
red_acc[7][bnum] = e_coul; \
} \
} \
if (vflag) { \
simd_reduce_arr(6, vwidth, virial); \
if (voffset==0) \
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
} \
} \
\
__syncthreads(); \
if (tid < active_subgs) { \
if (eflag) { \
energy = red_acc[6][tid]; \
e_coul = red_acc[7][tid]; \
} \
if (vflag) \
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
} else { \
if (eflag) energy = e_coul = (acctyp)0; \
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
} \
} \
\
if (bnum == 0) { \
int ei=BLOCK_ID_X; \
const int ev_stride=NUM_BLOCKS_X; \
if (eflag) { \
simd_reduce_add2(vwidth, energy, e_coul); \
if (tid==0) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=ev_stride; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
if (vflag) { \
simd_reduce_arr(6, vwidth, virial); \
if (tid==0) { \
for (int r=0; r<6; r++) { \
engv[ei]=virial[r]*(acctyp)0.5; \
ei+=ev_stride; \
} \
} \
} \
} \
} else if (offset==0 && ii<inum) { \
int ei=ii; \
if (eflag) { \
engv[ei]=energy*(acctyp)0.5; \
ei+=inum; \
engv[ei]=e_coul*(acctyp)0.5; \
ei+=inum; \
} \
if (vflag) { \
for (int i=0; i<6; i++) { \
engv[ei]=virial[i]*(acctyp)0.5; \
ei+=inum; \
} \
} \
} \
}
#else
// Shuffle branch with EVFLAG != 1: the store_answers macros of this branch
// do not reference red_acc, so every allocation macro expands to nothing.
#define local_allocate_store_three()
#define local_allocate_store_ellipse()
#define local_allocate_store_bio()
#define local_allocate_store_charge()
#define local_allocate_store_pair()
// Reduce and store the force only (shuffle variant, EVFLAG != 1).
// If t_per_atom>1, f.x/f.y/f.z are summed over the group with
// simd_reduce_add3; the thread with offset==0 and ii<inum then writes f to
// ans[ii]. The energy, virial, tid, eflag, vflag and engv arguments are
// accepted for interface compatibility and are not used.
// Expands to two statements.
#define store_answers(f, energy, virial, ii, inum, tid,                     \
                      t_per_atom, offset, eflag, vflag, ans, engv)          \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    ans[ii]=f;                                                              \
  }
// Reduce and store the force only (shuffle variant, EVFLAG != 1).
// If t_per_atom>1, f.x/f.y/f.z are summed over the group with
// simd_reduce_add3; the thread with offset==0 and ii<inum then writes f to
// ans[ii]. The energy, e_coul, virial, tid, eflag, vflag and engv arguments
// are accepted for interface compatibility and are not used.
// Expands to two statements.
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid,           \
                        t_per_atom, offset, eflag, vflag, ans, engv)        \
  if (t_per_atom>1) {                                                       \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    ans[ii]=f;                                                              \
  }
#endif
#endif