Feb2021 GPU Package Update - GPU Package Files
This commit is contained in:
@ -40,170 +40,521 @@
|
||||
nbor_begin+=offset; \
|
||||
}
|
||||
|
||||
#if (ARCH < 300)
|
||||
#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
|
||||
i, numj, stride, nbor_end, nbor_begin) \
|
||||
i=nbor_mem[ii]; \
|
||||
nbor_begin=ii+nbor_stride; \
|
||||
numj=nbor_mem[nbor_begin]; \
|
||||
nbor_begin+=nbor_stride+ii*(t_per_atom-1); \
|
||||
stride=fast_mul(t_per_atom,nbor_stride); \
|
||||
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & \
|
||||
(t_per_atom-1)); \
|
||||
nbor_begin+=offset;
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
|
||||
eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR]; \
|
||||
red_acc[0][tid]=f.x; \
|
||||
red_acc[1][tid]=f.y; \
|
||||
red_acc[2][tid]=f.z; \
|
||||
red_acc[3][tid]=energy; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<4; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
f.x=red_acc[0][tid]; \
|
||||
f.y=red_acc[1][tid]; \
|
||||
f.z=red_acc[2][tid]; \
|
||||
energy=red_acc[3][tid]; \
|
||||
if (vflag>0) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid]=virial[r]; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r]=red_acc[r][tid]; \
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
|
||||
#define simd_reduce_add1(width, local, offset, tid, one) \
|
||||
local[0][tid]=one; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (offset==0) one=local[0][tid];
|
||||
|
||||
#define simd_reduce_add2(width, local, offset, tid, one, two) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_add3(width, local, offset, tid, one, two, three) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
local[2][tid]=three; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
local[2][tid] += local[2][tid+s]; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
three=local[2][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_add6(width, local, offset, tid, one, two, three, \
|
||||
four, five, six) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
local[2][tid]=three; \
|
||||
local[3][tid]=four; \
|
||||
local[4][tid]=five; \
|
||||
local[5][tid]=six; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
local[2][tid] += local[2][tid+s]; \
|
||||
local[3][tid] += local[3][tid+s]; \
|
||||
local[4][tid] += local[4][tid+s]; \
|
||||
local[5][tid] += local[5][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
three=local[2][tid]; \
|
||||
four=local[3][tid]; \
|
||||
five=local[4][tid]; \
|
||||
six=local[5][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_arr(trip, width, local, offset, tid, arr) \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid]=arr[r]; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r]=local[r][tid]; \
|
||||
}
|
||||
|
||||
#define block_reduce_add1(width, local, tid, one) \
|
||||
local[0][tid]=one; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (tid==0) one=local[0][tid]; \
|
||||
}
|
||||
|
||||
#define block_reduce_add2(width, local, tid, one, two) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define block_reduce_arr(trip, width, local, tid, arr) \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid]=arr[r]; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r]=local[r][tid]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define local_allocate_store_pair() \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
#define local_allocate_store_charge() \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
#define local_allocate_store_bio() \
|
||||
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
|
||||
#define local_allocate_store_ellipse() \
|
||||
__local acctyp red_acc[6][BLOCK_ELLIPSE];
|
||||
#define local_allocate_store_three() \
|
||||
__local acctyp red_acc[6][BLOCK_ELLIPSE];
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
|
||||
if (EVFLAG && (vflag==2 || eflag==2)) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (EVFLAG && (eflag || vflag)) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_add1(simd_size(), red_acc, tid, energy); \
|
||||
if (vflag) __syncthreads(); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (EVFLAG && eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (EVFLAG && vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
}
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR]; \
|
||||
red_acc[0][tid]=f.x; \
|
||||
red_acc[1][tid]=f.y; \
|
||||
red_acc[2][tid]=f.z; \
|
||||
red_acc[3][tid]=energy; \
|
||||
red_acc[4][tid]=e_coul; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<5; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
|
||||
if (EVFLAG && (vflag==2 || eflag==2)) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
|
||||
} \
|
||||
} \
|
||||
f.x=red_acc[0][tid]; \
|
||||
f.y=red_acc[1][tid]; \
|
||||
f.z=red_acc[2][tid]; \
|
||||
energy=red_acc[3][tid]; \
|
||||
e_coul=red_acc[4][tid]; \
|
||||
if (vflag>0) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid]=virial[r]; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
|
||||
} \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r]=red_acc[r][tid]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (EVFLAG && (eflag || vflag)) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \
|
||||
if (vflag) __syncthreads(); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (EVFLAG && eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (EVFLAG && vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
|
||||
eflag, vflag, ans, engv) \
|
||||
#define simd_reduce_add1(width, one) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) one += shfl_down(one, s, width);
|
||||
|
||||
#define simd_reduce_add2(width, one, two) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_add3(width, one, two, three) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
three += shfl_down(three, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_add6(width, one, two, three, four, five, six) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
three += shfl_down(three, s, width); \
|
||||
four += shfl_down(four, s, width); \
|
||||
five += shfl_down(five, s, width); \
|
||||
six += shfl_down(six, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_arr(trip, width, arr) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r] += shfl_down(arr[r], s, width); \
|
||||
}
|
||||
|
||||
#if (EVFLAG == 1)
|
||||
|
||||
#define local_allocate_store_pair() \
|
||||
__local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_charge() \
|
||||
__local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_bio() \
|
||||
__local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_ellipse()
|
||||
#define local_allocate_store_three() \
|
||||
__local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
f.x += shfl_xor(f.x, s, t_per_atom); \
|
||||
f.y += shfl_xor(f.y, s, t_per_atom); \
|
||||
f.z += shfl_xor(f.z, s, t_per_atom); \
|
||||
energy += shfl_xor(energy, s, t_per_atom); \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
|
||||
} \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (vflag==2 || eflag==2) { \
|
||||
if (eflag) \
|
||||
simd_reduce_add1(t_per_atom,energy); \
|
||||
if (vflag) \
|
||||
simd_reduce_arr(6, t_per_atom,virial); \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (eflag || vflag) { \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int vwidth = simd_size(); \
|
||||
const int voffset = tid & (simd_size() - 1); \
|
||||
const int bnum = tid/simd_size(); \
|
||||
int active_subgs = BLOCK_SIZE_X/simd_size(); \
|
||||
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
|
||||
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
|
||||
if (bnum < active_subgs) { \
|
||||
if (eflag) { \
|
||||
simd_reduce_add1(vwidth, energy); \
|
||||
if (voffset==0) red_acc[6][bnum] = energy; \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (voffset==0) \
|
||||
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
__syncthreads(); \
|
||||
if (tid < active_subgs) { \
|
||||
if (eflag) energy = red_acc[6][tid]; \
|
||||
if (vflag) \
|
||||
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
|
||||
} else { \
|
||||
if (eflag) energy = (acctyp)0; \
|
||||
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (bnum == 0) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simd_reduce_add1(vwidth, energy); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
}
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
f.x += shfl_xor(f.x, s, t_per_atom); \
|
||||
f.y += shfl_xor(f.y, s, t_per_atom); \
|
||||
f.z += shfl_xor(f.z, s, t_per_atom); \
|
||||
energy += shfl_xor(energy, s, t_per_atom); \
|
||||
e_coul += shfl_xor(e_coul, s, t_per_atom); \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
|
||||
} \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (vflag==2 || eflag==2) { \
|
||||
if (eflag) \
|
||||
simd_reduce_add2(t_per_atom,energy,e_coul); \
|
||||
if (vflag) \
|
||||
simd_reduce_arr(6, t_per_atom,virial); \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (eflag || vflag) { \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int vwidth = simd_size(); \
|
||||
const int voffset = tid & (simd_size() - 1); \
|
||||
const int bnum = tid/simd_size(); \
|
||||
int active_subgs = BLOCK_SIZE_X/simd_size(); \
|
||||
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
|
||||
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
|
||||
if (bnum < active_subgs) { \
|
||||
if (eflag) { \
|
||||
simd_reduce_add2(vwidth, energy, e_coul); \
|
||||
if (voffset==0) { \
|
||||
red_acc[6][bnum] = energy; \
|
||||
red_acc[7][bnum] = e_coul; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (voffset==0) \
|
||||
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
__syncthreads(); \
|
||||
if (tid < active_subgs) { \
|
||||
if (eflag) { \
|
||||
energy = red_acc[6][tid]; \
|
||||
e_coul = red_acc[7][tid]; \
|
||||
} \
|
||||
if (vflag) \
|
||||
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
|
||||
} else { \
|
||||
if (eflag) energy = e_coul = (acctyp)0; \
|
||||
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (bnum == 0) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simd_reduce_add2(vwidth, energy, e_coul); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define local_allocate_store_pair()
|
||||
#define local_allocate_store_charge()
|
||||
#define local_allocate_store_bio()
|
||||
#define local_allocate_store_ellipse()
|
||||
#define local_allocate_store_three()
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) ans[ii]=f;
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) ans[ii]=f;
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
Reference in New Issue
Block a user