Misc Improvements to GPU Package

- Optimizations for molecular systems
-   Improved kernel performance and greater CPU overlap
- Reduced GPU to CPU communications for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
W. Michael Brown
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions

View File

@ -33,7 +33,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp3 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
@ -49,7 +49,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp4 f;
acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
@ -68,6 +68,7 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];
@ -120,7 +121,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp3 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
@ -141,7 +142,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
mor2[tid]=mor2_in[tid];
}
acctyp4 f;
acctyp3 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
@ -163,6 +164,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
ucl_prefetch(dev_packed+nbor+n_stride);
int j=dev_packed[nbor];
factor_lj = sp_lj[sbmask(j)];