Feb2021 GPU Package Update - GPU Package Files

This commit is contained in:
Michael Brown
2021-02-15 08:20:50 -08:00
parent 16004e8f45
commit e7e2d2323b
345 changed files with 13424 additions and 7708 deletions

View File

@@ -40,170 +40,521 @@
nbor_begin+=offset; \
}
#if (ARCH < 300)
/*
 * nbor_info_p: derive the neighbor-list cursor values for list slot `ii`.
 *
 * Inputs (read only):
 *   nbor_mem    - integer array indexed with ii and ii+nbor_stride
 *   nbor_stride - row stride used to step through nbor_mem
 *   t_per_atom  - divisor used to split numj across cooperating callers
 *   ii, offset  - list slot and the per-caller starting offset
 *
 * Outputs (must be assignable lvalues):
 *   i          = nbor_mem[ii]
 *   numj       = nbor_mem[ii+nbor_stride]
 *   stride     = t_per_atom*nbor_stride (via fast_mul)
 *   nbor_begin = first position to visit, already advanced by `offset`
 *   nbor_end   = end position computed before `offset` is applied
 *
 * The remainder term uses `numj & (t_per_atom-1)`, which matches
 * numj % t_per_atom only when t_per_atom is a power of two.
 *
 * Every parameter is parenthesized so that expression arguments
 * (e.g. `a+b` for t_per_atom) expand with the intended precedence.
 */
#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
                    i, numj, stride, nbor_end, nbor_begin) \
  (i)=(nbor_mem)[(ii)]; \
  (nbor_begin)=(ii)+(nbor_stride); \
  (numj)=(nbor_mem)[(nbor_begin)]; \
  (nbor_begin)+=(nbor_stride)+(ii)*((t_per_atom)-1); \
  (stride)=fast_mul((t_per_atom),(nbor_stride)); \
  (nbor_end)=(nbor_begin)+fast_mul((numj)/(t_per_atom),(stride))+ \
             ((numj) & ((t_per_atom)-1)); \
  (nbor_begin)+=(offset);
/*
 * store_answers (variant that declares its own red_acc scratch array).
 *
 * When t_per_atom>1, the caller's f.x, f.y, f.z and energy are staged in
 * rows 0..3 of red_acc at column tid and accumulated with a halving stride
 * (s = t_per_atom/2, ..., 1): callers with offset < s add the entry that
 * sits s columns further on. When vflag>0 the six virial components are
 * accumulated through the same array in the same way.
 *
 * NOTE(review): as written, this definition does not close the braces it
 * opens, and its last line still ends in a line continuation, so the
 * preprocessor directive on the following line is spliced into the macro
 * body. A second store_answers with the same parameter list is defined
 * further down in this file. This looks like leftover text from an earlier
 * revision -- confirm and remove it rather than rely on it.
 */
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
eflag, vflag, ans, engv) \
if (t_per_atom>1) { \
__local acctyp red_acc[6][BLOCK_PAIR]; \
red_acc[0][tid]=f.x; \
red_acc[1][tid]=f.y; \
red_acc[2][tid]=f.z; \
red_acc[3][tid]=energy; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<4; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
f.x=red_acc[0][tid]; \
f.y=red_acc[1][tid]; \
f.z=red_acc[2][tid]; \
energy=red_acc[3][tid]; \
if (vflag>0) { \
for (int r=0; r<6; r++) \
red_acc[r][tid]=virial[r]; \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
if (offset < s) { \
for (int r=0; r<6; r++) \
red_acc[r][tid] += red_acc[r][tid+s]; \
} \
} \
for (int r=0; r<6; r++) \
virial[r]=red_acc[r][tid]; \
#if (SHUFFLE_AVAIL == 0)
/*
 * simd_reduce_add1: sum one value across `width` neighbouring columns of a
 * scratch array.
 *
 *   width  - number of columns taking part (halved each step)
 *   local  - 2-D scratch array; row 0 is used
 *   offset - caller's position inside its group of `width`
 *   tid    - caller's column in `local`
 *   one    - in: caller's contribution; out: the total, but only for the
 *            caller whose offset is 0 (left untouched for the others)
 *
 * simdsync() is invoked before every accumulation step. Parameters are
 * parenthesized so expression arguments keep their intended precedence.
 */
#define simd_reduce_add1(width, local, offset, tid, one) \
  (local)[0][(tid)]=(one); \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) (local)[0][(tid)] += (local)[0][(tid)+s]; \
  } \
  if ((offset)==0) (one)=(local)[0][(tid)];
/*
 * simd_reduce_add2: two-value counterpart of simd_reduce_add1.
 *
 * `one` and `two` are staged in rows 0 and 1 of `local` at column `tid`,
 * accumulated with a halving stride (simdsync() before each step), and the
 * totals are copied back only for the caller whose offset is 0.
 *
 * The body is limited to the macro's own parameters: statements that wrote
 * to engv/energy (names this macro does not receive) and left a brace
 * unclosed have been dropped so the macro matches its one- and six-value
 * siblings. Parameters are parenthesized for expression arguments.
 */
#define simd_reduce_add2(width, local, offset, tid, one, two) \
  (local)[0][(tid)]=(one); \
  (local)[1][(tid)]=(two); \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) { \
      (local)[0][(tid)] += (local)[0][(tid)+s]; \
      (local)[1][(tid)] += (local)[1][(tid)+s]; \
    } \
  } \
  if ((offset)==0) { \
    (one)=(local)[0][(tid)]; \
    (two)=(local)[1][(tid)]; \
  }
/*
 * simd_reduce_add3: three-value counterpart of simd_reduce_add1.
 *
 * `one`, `two`, `three` are staged in rows 0..2 of `local` at column `tid`,
 * accumulated with a halving stride (simdsync() before each step), and the
 * totals are copied back only for the caller whose offset is 0.
 *
 * The reduction loop contains only the accumulation: statements that wrote
 * virial values to engv (names this macro does not receive) have been
 * dropped from inside the loop. Parameters are parenthesized for
 * expression arguments.
 */
#define simd_reduce_add3(width, local, offset, tid, one, two, three) \
  (local)[0][(tid)]=(one); \
  (local)[1][(tid)]=(two); \
  (local)[2][(tid)]=(three); \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) { \
      (local)[0][(tid)] += (local)[0][(tid)+s]; \
      (local)[1][(tid)] += (local)[1][(tid)+s]; \
      (local)[2][(tid)] += (local)[2][(tid)+s]; \
    } \
  } \
  if ((offset)==0) { \
    (one)=(local)[0][(tid)]; \
    (two)=(local)[1][(tid)]; \
    (three)=(local)[2][(tid)]; \
  }
/*
 * simd_reduce_add6: six-value counterpart of simd_reduce_add1.
 *
 * The six values are staged in rows 0..5 of `local` at column `tid`,
 * accumulated with a halving stride (simdsync() before each step), and the
 * totals are copied back only for the caller whose offset is 0.
 * Parameters are parenthesized so expression arguments keep their
 * intended precedence.
 */
#define simd_reduce_add6(width, local, offset, tid, one, two, three, \
                         four, five, six) \
  (local)[0][(tid)]=(one); \
  (local)[1][(tid)]=(two); \
  (local)[2][(tid)]=(three); \
  (local)[3][(tid)]=(four); \
  (local)[4][(tid)]=(five); \
  (local)[5][(tid)]=(six); \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) { \
      (local)[0][(tid)] += (local)[0][(tid)+s]; \
      (local)[1][(tid)] += (local)[1][(tid)+s]; \
      (local)[2][(tid)] += (local)[2][(tid)+s]; \
      (local)[3][(tid)] += (local)[3][(tid)+s]; \
      (local)[4][(tid)] += (local)[4][(tid)+s]; \
      (local)[5][(tid)] += (local)[5][(tid)+s]; \
    } \
  } \
  if ((offset)==0) { \
    (one)=(local)[0][(tid)]; \
    (two)=(local)[1][(tid)]; \
    (three)=(local)[2][(tid)]; \
    (four)=(local)[3][(tid)]; \
    (five)=(local)[4][(tid)]; \
    (six)=(local)[5][(tid)]; \
  }
/*
 * simd_reduce_arr: array form of simd_reduce_add1.
 *
 *   trip   - number of leading elements of `arr` to reduce
 *   width  - number of columns taking part (halved each step)
 *   local  - 2-D scratch array; rows 0..trip-1 are used
 *   offset - caller's position inside its group of `width`
 *   tid    - caller's column in `local`
 *   arr    - in: caller's contributions; out: totals, only for the caller
 *            whose offset is 0
 *
 * simdsync() is invoked before every accumulation step. Parameters are
 * parenthesized so expression arguments keep their intended precedence.
 */
#define simd_reduce_arr(trip, width, local, offset, tid, arr) \
  for (int r=0; r<(trip); r++) \
    (local)[r][(tid)]=(arr)[r]; \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    simdsync(); \
    if ((offset) < s) { \
      for (int r=0; r<(trip); r++) \
        (local)[r][(tid)] += (local)[r][(tid)+s]; \
    } \
  } \
  if ((offset)==0) { \
    for (int r=0; r<(trip); r++) \
      (arr)[r]=(local)[r][(tid)]; \
  }
/*
 * block_reduce_add1: sum one value over BLOCK_SIZE_X columns of `local`.
 *
 * Two phases:
 *   1. strides BLOCK_SIZE_X/2 down to (but excluding) width/2, with
 *      __syncthreads() before each step;
 *   2. for callers with tid < width, strides width/2 down to 1, with
 *      simdsync() before each step.
 * Only the caller with tid == 0 receives the total in `one`; it is left
 * untouched for everyone else.
 *
 * Parameters are parenthesized so that an expression passed as `width`
 * is not torn apart by `/2` or `<`.
 */
#define block_reduce_add1(width, local, tid, one) \
  (local)[0][(tid)]=(one); \
  for (unsigned int s=(BLOCK_SIZE_X)/2; s>(width)/2; s>>=1) { \
    __syncthreads(); \
    if ((tid) < s) (local)[0][(tid)] += (local)[0][(tid)+s]; \
  } \
  if ((tid)<(width)) { \
    for (unsigned int s=(width)/2; s>0; s>>=1) { \
      simdsync(); \
      if ((tid) < s) (local)[0][(tid)] += (local)[0][(tid)+s]; \
    } \
    if ((tid)==0) (one)=(local)[0][(tid)]; \
  }
/*
 * block_reduce_add2: two-value counterpart of block_reduce_add1.
 *
 * `one` and `two` travel through rows 0 and 1 of `local`. Strides from
 * BLOCK_SIZE_X/2 down to (excluding) width/2 are separated by
 * __syncthreads(); the remaining strides, executed only where tid < width,
 * are separated by simdsync(). The totals are written back only for
 * tid == 0.
 *
 * Parameters are parenthesized so expression arguments keep their
 * intended precedence.
 */
#define block_reduce_add2(width, local, tid, one, two) \
  (local)[0][(tid)]=(one); \
  (local)[1][(tid)]=(two); \
  for (unsigned int s=(BLOCK_SIZE_X)/2; s>(width)/2; s>>=1) { \
    __syncthreads(); \
    if ((tid) < s) { \
      (local)[0][(tid)] += (local)[0][(tid)+s]; \
      (local)[1][(tid)] += (local)[1][(tid)+s]; \
    } \
  } \
  if ((tid)<(width)) { \
    for (unsigned int s=(width)/2; s>0; s>>=1) { \
      simdsync(); \
      if ((tid) < s) { \
        (local)[0][(tid)] += (local)[0][(tid)+s]; \
        (local)[1][(tid)] += (local)[1][(tid)+s]; \
      } \
    } \
    if ((tid)==0) { \
      (one)=(local)[0][(tid)]; \
      (two)=(local)[1][(tid)]; \
    } \
  }
/*
 * block_reduce_arr: array form of block_reduce_add1.
 *
 *   trip  - number of leading elements of `arr` to reduce
 *   width - boundary between the __syncthreads() phase (strides above
 *           width/2) and the simdsync() phase (strides width/2 .. 1,
 *           executed only where tid < width)
 *   local - 2-D scratch array; rows 0..trip-1 are used
 *   tid   - caller's column in `local`
 *   arr   - in: contributions; out: totals, only for tid == 0
 *
 * Parameters are parenthesized so expression arguments keep their
 * intended precedence.
 */
#define block_reduce_arr(trip, width, local, tid, arr) \
  for (int r=0; r<(trip); r++) \
    (local)[r][(tid)]=(arr)[r]; \
  for (unsigned int s=(BLOCK_SIZE_X)/2; s>(width)/2; s>>=1) { \
    __syncthreads(); \
    if ((tid) < s) { \
      for (int r=0; r<(trip); r++) \
        (local)[r][(tid)] += (local)[r][(tid)+s]; \
    } \
  } \
  if ((tid)<(width)) { \
    for (unsigned int s=(width)/2; s>0; s>>=1) { \
      simdsync(); \
      if ((tid) < s) { \
        for (int r=0; r<(trip); r++) \
          (local)[r][(tid)] += (local)[r][(tid)+s]; \
      } \
    } \
    if ((tid)==0) { \
      for (int r=0; r<(trip); r++) \
        (arr)[r]=(local)[r][(tid)]; \
    } \
  }
/*
 * local_allocate_store_*(): each expands to the declaration of a
 * __local-qualified scratch array named red_acc with element type acctyp
 * and 6 rows. Only the column count differs:
 *   pair, charge    -> BLOCK_PAIR
 *   bio             -> BLOCK_BIO_PAIR
 *   ellipse, three  -> BLOCK_ELLIPSE
 * The store_answers / store_answers_q macros that follow reference red_acc
 * by name without declaring it, so one of these is expected to have been
 * expanded in the same scope first.
 */
#define local_allocate_store_pair() \
__local acctyp red_acc[6][BLOCK_PAIR];
#define local_allocate_store_charge() \
__local acctyp red_acc[6][BLOCK_PAIR];
#define local_allocate_store_bio() \
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
#define local_allocate_store_ellipse() \
__local acctyp red_acc[6][BLOCK_ELLIPSE];
#define local_allocate_store_three() \
__local acctyp red_acc[6][BLOCK_ELLIPSE];
/*
 * store_answers: reduce and write out force, energy and virial for list
 * slot `ii` (scratch-array variant; expects red_acc to be declared in the
 * enclosing scope).
 *
 *   f                 - force accumulator with .x/.y/.z members
 *   energy, virial    - energy scalar and 6-element virial array
 *   ii, inum          - list slot and number of valid slots
 *   tid               - caller's column in red_acc
 *   t_per_atom/offset - group size and caller's position in the group
 *   eflag, vflag      - 0: skip; 2: one result per slot; any other
 *                       non-zero value: one result per block
 *   ans, engv         - output arrays
 *
 * Behavior:
 *   - With t_per_atom>1 the force is reduced across the group; energy and
 *     virial are reduced across the group only in the per-slot (==2) case.
 *   - ans[ii] is written exactly once, by the caller with offset==0, and
 *     only when ii<inum.
 *   - Per-block mode: energy/virial are reduced over the block and tid 0
 *     writes them, scaled by 0.5, to engv[BLOCK_ID_X + k*NUM_BLOCKS_X].
 *   - Per-slot mode: the offset==0 caller writes them, scaled by 0.5, to
 *     engv[ii + k*inum].
 *
 * A second, unguarded `ans[ii]=f` that ran for every caller (including
 * offset!=0 and ii>=inum) whenever eflag or vflag was set has been removed:
 * it could write past the valid slots and overwrite the reduced force with
 * an unreduced partial.
 */
#define store_answers(f, energy, virial, ii, inum, tid, \
                      t_per_atom, offset, eflag, vflag, ans, engv) \
  if (t_per_atom>1) { \
    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
    if (EVFLAG && (vflag==2 || eflag==2)) { \
      if (eflag) { \
        simdsync(); \
        simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \
      } \
      if (vflag) { \
        simdsync(); \
        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
      } \
    } \
  } \
  if (offset==0 && ii<inum) ans[ii]=f; \
  if (EVFLAG && (eflag || vflag)) { \
    int ei=BLOCK_ID_X; \
    if (eflag!=2 && vflag!=2) { \
      const int ev_stride=NUM_BLOCKS_X; \
      if (eflag) { \
        simdsync(); \
        block_reduce_add1(simd_size(), red_acc, tid, energy); \
        if (vflag) __syncthreads(); \
        if (tid==0) { \
          engv[ei]=energy*(acctyp)0.5; \
          ei+=ev_stride; \
        } \
      } \
      if (vflag) { \
        simdsync(); \
        block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
        if (tid==0) { \
          for (int r=0; r<6; r++) { \
            engv[ei]=virial[r]*(acctyp)0.5; \
            ei+=ev_stride; \
          } \
        } \
      } \
    } else if (offset==0 && ii<inum) { \
      int ei=ii; \
      if (EVFLAG && eflag) { \
        engv[ei]=energy*(acctyp)0.5; \
        ei+=inum; \
      } \
      if (EVFLAG && vflag) { \
        for (int i=0; i<6; i++) { \
          engv[ei]=virial[i]*(acctyp)0.5; \
          ei+=inum; \
        } \
      } \
    } \
  }
/*
 * store_answers_q: store_answers with an additional Coulomb energy term
 * (scratch-array variant; expects red_acc to be declared in the enclosing
 * scope).
 *
 *   f                 - force accumulator with .x/.y/.z members
 *   energy, e_coul    - two energy scalars, always written as a pair
 *   virial            - 6-element virial array
 *   ii, inum          - list slot and number of valid slots
 *   tid               - caller's column in red_acc
 *   t_per_atom/offset - group size and caller's position in the group
 *   eflag, vflag      - 0: skip; 2: one result per slot; any other
 *                       non-zero value: one result per block
 *   ans, engv         - output arrays
 *
 * Output order in engv is energy, e_coul, then virial[0..5], each scaled
 * by 0.5. Entries are NUM_BLOCKS_X apart starting at BLOCK_ID_X in
 * per-block mode, and inum apart starting at ii in per-slot mode.
 * ans[ii] is written once, by the offset==0 caller, and only when ii<inum.
 *
 * The body previously mixed two implementations: a hand-written red_acc
 * loop with its own red_acc declaration and write-out, plus the
 * helper-macro form. That left the braces unbalanced and every quantity
 * reduced and stored twice. It now contains only the helper-macro form and
 * mirrors store_answers statement for statement.
 */
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
                        t_per_atom, offset, eflag, vflag, ans, engv) \
  if (t_per_atom>1) { \
    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
    if (EVFLAG && (vflag==2 || eflag==2)) { \
      if (eflag) { \
        simdsync(); \
        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
      } \
      if (vflag) { \
        simdsync(); \
        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
      } \
    } \
  } \
  if (offset==0 && ii<inum) ans[ii]=f; \
  if (EVFLAG && (eflag || vflag)) { \
    int ei=BLOCK_ID_X; \
    const int ev_stride=NUM_BLOCKS_X; \
    if (eflag!=2 && vflag!=2) { \
      if (eflag) { \
        simdsync(); \
        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \
        if (vflag) __syncthreads(); \
        if (tid==0) { \
          engv[ei]=energy*(acctyp)0.5; \
          ei+=ev_stride; \
          engv[ei]=e_coul*(acctyp)0.5; \
          ei+=ev_stride; \
        } \
      } \
      if (vflag) { \
        simdsync(); \
        block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
        if (tid==0) { \
          for (int r=0; r<6; r++) { \
            engv[ei]=virial[r]*(acctyp)0.5; \
            ei+=ev_stride; \
          } \
        } \
      } \
    } else if (offset==0 && ii<inum) { \
      int ei=ii; \
      if (EVFLAG && eflag) { \
        engv[ei]=energy*(acctyp)0.5; \
        ei+=inum; \
        engv[ei]=e_coul*(acctyp)0.5; \
        ei+=inum; \
      } \
      if (EVFLAG && vflag) { \
        for (int i=0; i<6; i++) { \
          engv[ei]=virial[i]*(acctyp)0.5; \
          ei+=inum; \
        } \
      } \
    } \
  }
#else
/*
 * simd_reduce_add1 (shuffle variant): add to `one` the value returned by
 * shfl_down(one, s, width) for s = width/2, width/4, ..., 1.
 *
 * This definition stands on its own line: a truncated store_answers header
 * that ended in a line continuation used to precede it, which spliced this
 * #define into that macro's body and left simd_reduce_add1 undefined in
 * this branch. Parameters are parenthesized for expression arguments.
 */
#define simd_reduce_add1(width, one) \
  for (unsigned int s=(width)/2; s>0; s>>=1) \
    (one) += shfl_down((one), s, (width));
/*
 * simd_reduce_add2 (shuffle variant): for s = width/2, ..., 1 add
 * shfl_down(x, s, width) to each of `one` and `two`.
 * Parameters are parenthesized so expression arguments keep their
 * intended precedence.
 */
#define simd_reduce_add2(width, one, two) \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    (one) += shfl_down((one), s, (width)); \
    (two) += shfl_down((two), s, (width)); \
  }
/*
 * simd_reduce_add3 (shuffle variant): for s = width/2, ..., 1 add
 * shfl_down(x, s, width) to each of `one`, `two` and `three`.
 * Parameters are parenthesized so expression arguments keep their
 * intended precedence.
 */
#define simd_reduce_add3(width, one, two, three) \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    (one) += shfl_down((one), s, (width)); \
    (two) += shfl_down((two), s, (width)); \
    (three) += shfl_down((three), s, (width)); \
  }
/*
 * simd_reduce_add6 (shuffle variant): for s = width/2, ..., 1 add
 * shfl_down(x, s, width) to each of the six value arguments.
 * Parameters are parenthesized so expression arguments keep their
 * intended precedence.
 */
#define simd_reduce_add6(width, one, two, three, four, five, six) \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    (one) += shfl_down((one), s, (width)); \
    (two) += shfl_down((two), s, (width)); \
    (three) += shfl_down((three), s, (width)); \
    (four) += shfl_down((four), s, (width)); \
    (five) += shfl_down((five), s, (width)); \
    (six) += shfl_down((six), s, (width)); \
  }
/*
 * simd_reduce_arr (shuffle variant): for s = width/2, ..., 1 add
 * shfl_down(arr[r], s, width) to arr[r] for every r in 0..trip-1.
 * Parameters are parenthesized so expression arguments keep their
 * intended precedence.
 */
#define simd_reduce_arr(trip, width, arr) \
  for (unsigned int s=(width)/2; s>0; s>>=1) { \
    for (int r=0; r<(trip); r++) \
      (arr)[r] += shfl_down((arr)[r], s, (width)); \
  }
#if (EVFLAG == 1)
/*
 * local_allocate_store_*() for the EVFLAG == 1 branch: each non-empty macro
 * declares a __local-qualified array red_acc of acctyp whose column count
 * is the block size divided by SIMD_SIZE.
 *   pair    -> 7 rows, BLOCK_PAIR / SIMD_SIZE columns
 *   charge  -> 8 rows, BLOCK_PAIR / SIMD_SIZE columns
 *   bio     -> 8 rows, BLOCK_BIO_PAIR / SIMD_SIZE columns
 *   three   -> 7 rows, BLOCK_ELLIPSE / SIMD_SIZE columns
 * In the store macros of this branch rows 0..5 receive the virial
 * components, row 6 the energy and (store_answers_q only) row 7 e_coul.
 *
 * NOTE(review): local_allocate_store_ellipse() expands to nothing here,
 * unlike its sibling variants -- confirm that code using it never expands
 * a macro that touches red_acc in this configuration.
 */
#define local_allocate_store_pair() \
__local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];
#define local_allocate_store_charge() \
__local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];
#define local_allocate_store_bio() \
__local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];
#define local_allocate_store_ellipse()
#define local_allocate_store_three() \
__local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];
/*
 * store_answers (shuffle variant, EVFLAG == 1): reduce and write out force,
 * energy and virial for list slot `ii`. Expects red_acc to be declared in
 * the enclosing scope.
 *
 *   f                 - force accumulator with .x/.y/.z members
 *   energy, virial    - energy scalar and 6-element virial array
 *   ii, inum          - list slot and number of valid slots
 *   tid               - caller's index within the block
 *   t_per_atom/offset - group size and caller's position in the group
 *   eflag, vflag      - 0: skip; 2: one result per slot; any other
 *                       non-zero value: one result per block
 *   ans, engv         - output arrays
 *
 * Steps:
 *   1. With t_per_atom>1, reduce the force over the group; reduce energy
 *      and virial over the group only in the per-slot (==2) case.
 *   2. The offset==0 caller writes ans[ii] when ii<inum.
 *   3. Per-block mode: while more than one sub-group is active, each
 *      active sub-group reduces its values with the simd_reduce_* helpers
 *      and the caller with voffset==0 parks them in red_acc (rows 0..5
 *      virial, row 6 energy). After __syncthreads() the first
 *      `active_subgs` callers reload them and everyone else zeroes theirs.
 *      Finally sub-group 0 reduces once more and tid 0 writes the results,
 *      scaled by 0.5, to engv[BLOCK_ID_X + k*NUM_BLOCKS_X].
 *   4. Per-slot mode: the offset==0 caller writes the results, scaled by
 *      0.5, to engv[ii + k*inum].
 *
 * The body previously also contained a shfl_xor reduction pass, a second
 * engv write-out with unbalanced braces and an unguarded `ans[ii]=f`; with
 * both passes present every quantity was accumulated twice. Only the
 * helper-macro form is kept.
 */
#define store_answers(f, energy, virial, ii, inum, tid, \
                      t_per_atom, offset, eflag, vflag, ans, engv) \
  if (t_per_atom>1) { \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
    if (vflag==2 || eflag==2) { \
      if (eflag) \
        simd_reduce_add1(t_per_atom,energy); \
      if (vflag) \
        simd_reduce_arr(6, t_per_atom,virial); \
    } \
  } \
  if (offset==0 && ii<inum) ans[ii]=f; \
  if (eflag || vflag) { \
    if (eflag!=2 && vflag!=2) { \
      const int vwidth = simd_size(); \
      const int voffset = tid & (simd_size() - 1); \
      const int bnum = tid/simd_size(); \
      int active_subgs = BLOCK_SIZE_X/simd_size(); \
      for ( ; active_subgs > 1; active_subgs /= vwidth) { \
        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
        if (bnum < active_subgs) { \
          if (eflag) { \
            simd_reduce_add1(vwidth, energy); \
            if (voffset==0) red_acc[6][bnum] = energy; \
          } \
          if (vflag) { \
            simd_reduce_arr(6, vwidth, virial); \
            if (voffset==0) \
              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
          } \
        } \
        \
        __syncthreads(); \
        if (tid < active_subgs) { \
          if (eflag) energy = red_acc[6][tid]; \
          if (vflag) \
            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
        } else { \
          if (eflag) energy = (acctyp)0; \
          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
        } \
      } \
      \
      if (bnum == 0) { \
        int ei=BLOCK_ID_X; \
        const int ev_stride=NUM_BLOCKS_X; \
        if (eflag) { \
          simd_reduce_add1(vwidth, energy); \
          if (tid==0) { \
            engv[ei]=energy*(acctyp)0.5; \
            ei+=ev_stride; \
          } \
        } \
        if (vflag) { \
          simd_reduce_arr(6, vwidth, virial); \
          if (tid==0) { \
            for (int r=0; r<6; r++) { \
              engv[ei]=virial[r]*(acctyp)0.5; \
              ei+=ev_stride; \
            } \
          } \
        } \
      } \
    } else if (offset==0 && ii<inum) { \
      int ei=ii; \
      if (eflag) { \
        engv[ei]=energy*(acctyp)0.5; \
        ei+=inum; \
      } \
      if (vflag) { \
        for (int i=0; i<6; i++) { \
          engv[ei]=virial[i]*(acctyp)0.5; \
          ei+=inum; \
        } \
      } \
    } \
  }
/*
 * store_answers_q (shuffle variant, EVFLAG == 1): store_answers with an
 * additional Coulomb energy term. Expects red_acc (with at least 8 rows)
 * to be declared in the enclosing scope.
 *
 *   f                 - force accumulator with .x/.y/.z members
 *   energy, e_coul    - two energy scalars, always handled as a pair
 *   virial            - 6-element virial array
 *   ii, inum          - list slot and number of valid slots
 *   tid               - caller's index within the block
 *   t_per_atom/offset - group size and caller's position in the group
 *   eflag, vflag      - 0: skip; 2: one result per slot; any other
 *                       non-zero value: one result per block
 *   ans, engv         - output arrays
 *
 * Per-block mode parks partial sums in red_acc between sub-group passes:
 * rows 0..5 virial, row 6 energy, row 7 e_coul. Output order in engv is
 * energy, e_coul, virial[0..5], each scaled by 0.5; entries are
 * NUM_BLOCKS_X apart starting at BLOCK_ID_X (per block) or inum apart
 * starting at ii (per slot). ans[ii] is written once, by the offset==0
 * caller, and only when ii<inum.
 *
 * The body previously also contained a shfl_xor reduction pass, a second
 * engv write-out with unbalanced braces and an unguarded `ans[ii]=f`; with
 * both passes present every quantity was accumulated twice. Only the
 * helper-macro form is kept.
 */
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
                        t_per_atom, offset, eflag, vflag, ans, engv) \
  if (t_per_atom>1) { \
    simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
    if (vflag==2 || eflag==2) { \
      if (eflag) \
        simd_reduce_add2(t_per_atom,energy,e_coul); \
      if (vflag) \
        simd_reduce_arr(6, t_per_atom,virial); \
    } \
  } \
  if (offset==0 && ii<inum) ans[ii]=f; \
  if (eflag || vflag) { \
    if (eflag!=2 && vflag!=2) { \
      const int vwidth = simd_size(); \
      const int voffset = tid & (simd_size() - 1); \
      const int bnum = tid/simd_size(); \
      int active_subgs = BLOCK_SIZE_X/simd_size(); \
      for ( ; active_subgs > 1; active_subgs /= vwidth) { \
        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
        if (bnum < active_subgs) { \
          if (eflag) { \
            simd_reduce_add2(vwidth, energy, e_coul); \
            if (voffset==0) { \
              red_acc[6][bnum] = energy; \
              red_acc[7][bnum] = e_coul; \
            } \
          } \
          if (vflag) { \
            simd_reduce_arr(6, vwidth, virial); \
            if (voffset==0) \
              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
          } \
        } \
        \
        __syncthreads(); \
        if (tid < active_subgs) { \
          if (eflag) { \
            energy = red_acc[6][tid]; \
            e_coul = red_acc[7][tid]; \
          } \
          if (vflag) \
            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
        } else { \
          if (eflag) energy = e_coul = (acctyp)0; \
          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
        } \
      } \
      \
      if (bnum == 0) { \
        int ei=BLOCK_ID_X; \
        const int ev_stride=NUM_BLOCKS_X; \
        if (eflag) { \
          simd_reduce_add2(vwidth, energy, e_coul); \
          if (tid==0) { \
            engv[ei]=energy*(acctyp)0.5; \
            ei+=ev_stride; \
            engv[ei]=e_coul*(acctyp)0.5; \
            ei+=ev_stride; \
          } \
        } \
        if (vflag) { \
          simd_reduce_arr(6, vwidth, virial); \
          if (tid==0) { \
            for (int r=0; r<6; r++) { \
              engv[ei]=virial[r]*(acctyp)0.5; \
              ei+=ev_stride; \
            } \
          } \
        } \
      } \
    } else if (offset==0 && ii<inum) { \
      int ei=ii; \
      if (eflag) { \
        engv[ei]=energy*(acctyp)0.5; \
        ei+=inum; \
        engv[ei]=e_coul*(acctyp)0.5; \
        ei+=inum; \
      } \
      if (vflag) { \
        for (int i=0; i<6; i++) { \
          engv[ei]=virial[i]*(acctyp)0.5; \
          ei+=inum; \
        } \
      } \
    } \
  }
#else
/*
 * local_allocate_store_*() for the branch where EVFLAG is not 1: all five
 * expand to nothing. The store_answers / store_answers_q macros defined
 * right after them in this branch do not reference red_acc, so no scratch
 * array is declared.
 */
#define local_allocate_store_pair()
#define local_allocate_store_charge()
#define local_allocate_store_bio()
#define local_allocate_store_ellipse()
#define local_allocate_store_three()
/*
 * store_answers (shuffle variant, EVFLAG != 1): force-only write-out.
 *
 * With t_per_atom>1 the three force components are reduced with
 * simd_reduce_add3; the caller with offset==0 then stores f to ans[ii],
 * provided ii<inum. energy, virial, tid, eflag, vflag and engv are
 * accepted for signature compatibility and are not used.
 *
 * Parameters are parenthesized (so that e.g. `lane & 1` passed as offset
 * is compared as a whole) and the conditional reduction is braced so it
 * stays a single guarded statement whatever simd_reduce_add3 expands to.
 */
#define store_answers(f, energy, virial, ii, inum, tid, \
                      t_per_atom, offset, eflag, vflag, ans, engv) \
  if ((t_per_atom)>1) { \
    simd_reduce_add3((t_per_atom), (f).x, (f).y, (f).z); \
  } \
  if ((offset)==0 && (ii)<(inum)) (ans)[(ii)]=(f);
/*
 * store_answers_q (shuffle variant, EVFLAG != 1): force-only write-out.
 *
 * With t_per_atom>1 the three force components are reduced with
 * simd_reduce_add3; the caller with offset==0 then stores f to ans[ii],
 * provided ii<inum. energy, e_coul, virial, tid, eflag, vflag and engv are
 * accepted for signature compatibility and are not used.
 *
 * Parameters are parenthesized (so that e.g. `lane & 1` passed as offset
 * is compared as a whole) and the conditional reduction is braced so it
 * stays a single guarded statement whatever simd_reduce_add3 expands to.
 */
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
                        t_per_atom, offset, eflag, vflag, ans, engv) \
  if ((t_per_atom)>1) { \
    simd_reduce_add3((t_per_atom), (f).x, (f).y, (f).z); \
  } \
  if ((offset)==0 && (ii)<(inum)) (ans)[(ii)]=(f);
#endif
#endif