Misc Improvements to GPU Package
- Optimizations for molecular systems
- Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communications for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
@ -86,7 +86,7 @@
|
||||
ap1+=astride; \
|
||||
} \
|
||||
} \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -131,7 +131,7 @@
|
||||
ap1+=astride; \
|
||||
} \
|
||||
} \
|
||||
acctyp4 old=ans[ii]; \
|
||||
acctyp3 old=ans[ii]; \
|
||||
old.x+=f.x; \
|
||||
old.y+=f.y; \
|
||||
old.z+=f.z; \
|
||||
@ -154,7 +154,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
|
||||
const int ntypes,
|
||||
const __global int *dev_nbor,
|
||||
const int stride,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
const int astride,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
@ -180,7 +180,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
|
||||
const numtyp solv_f_r =
|
||||
(numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
|
||||
|
||||
acctyp4 f, tor;
|
||||
acctyp3 f, tor;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
@ -216,6 +216,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_nbor+nbor+n_stride);
|
||||
int j=dev_nbor[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
j &= NEIGHMASK;
|
||||
@ -409,7 +410,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
const int ntypes,
|
||||
const __global int *dev_nbor,
|
||||
const int stride,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag,
|
||||
@ -435,7 +436,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
const numtyp solv_f_r =
|
||||
(numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0);
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -454,6 +455,7 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_nbor+nbor+n_stride);
|
||||
int i=dev_nbor[nbor];
|
||||
factor_lj = sp_lj[sbmask(i)];
|
||||
i &= NEIGHMASK;
|
||||
@ -610,7 +612,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict gum,
|
||||
const int stride,
|
||||
const __global int *dev_ij,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag, const int start,
|
||||
@ -628,7 +630,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
|
||||
sp_lj[2]=gum[2];
|
||||
sp_lj[3]=gum[3];
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -647,6 +649,7 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_ij+nbor+n_stride);
|
||||
|
||||
int j=dev_ij[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
@ -697,7 +700,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp *restrict gum,
|
||||
const int stride,
|
||||
const __global int *dev_ij,
|
||||
__global acctyp4 *restrict ans,
|
||||
__global acctyp3 *restrict ans,
|
||||
__global acctyp *restrict engv,
|
||||
__global int *restrict err_flag,
|
||||
const int eflag, const int vflag,
|
||||
@ -721,7 +724,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
|
||||
lj3[tid]=lj3_in[tid];
|
||||
}
|
||||
|
||||
acctyp4 f;
|
||||
acctyp3 f;
|
||||
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
|
||||
acctyp energy, virial[6];
|
||||
if (EVFLAG) {
|
||||
@ -743,6 +746,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_,
|
||||
|
||||
numtyp factor_lj;
|
||||
for ( ; nbor<nbor_end; nbor+=n_stride) {
|
||||
ucl_prefetch(dev_ij+nbor+n_stride);
|
||||
|
||||
int j=dev_ij[nbor];
|
||||
factor_lj = sp_lj[sbmask(j)];
|
||||
|
||||
Reference in New Issue
Block a user