Misc Improvements to GPU Package

- Optimizations for molecular systems
  - Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communication for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
W. Michael Brown
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions

View File

@ -52,7 +52,7 @@ _texture_2d( pos_tex,int4);
compute the id of the cell to which each atom belongs
x: atom coordinates
cell_id: cell ids
particle_id:
particle_id:
boxlo[0-2]: the lower left corner of the local box
ncell[xyz]: the number of cells in xyz dims
i_cell_size is the inverse cell size
@ -489,6 +489,10 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
#endif
#define SPECIAL_DATA_PRELOAD_SIZE 3
#define UNROLL_FACTOR_LIST 4
#define UNROLL_FACTOR_SPECIAL 2
__kernel void kernel_special(__global int *dev_nbor,
__global int *host_nbor_list,
const __global int *host_numj,
@ -526,23 +530,68 @@ __kernel void kernel_special(__global int *dev_nbor,
list_end=list+fast_mul(numj,stride);
}
for ( ; list<list_end; list+=stride) {
int nbor=*list;
tagint jtag=tag[nbor];
#if SPECIAL_DATA_PRELOAD_SIZE > 0
tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE];
for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) {
special_preload[j] = special[ii + i*nt];
}
#endif
int offset=ii;
for (int i=0; i<n3; i++) {
if (special[offset]==jtag) {
int which = 1;
if (i>=n1)
which++;
if (i>=n2)
which++;
nbor=nbor ^ (which << SBBITS);
*list=nbor;
for ( ; list<list_end; list+=UNROLL_FACTOR_LIST * stride) {
int nbor[UNROLL_FACTOR_LIST];
tagint jtag[UNROLL_FACTOR_LIST];
__global int* list_addr[UNROLL_FACTOR_LIST];
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
list_addr[l] = list + l*stride;
nbor[l] = *list_addr[l];
}
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
jtag[l] = tag[nbor[l]];
}
for (int i=0, j=0; i<n3; i+=UNROLL_FACTOR_SPECIAL, j++) {
tagint special_data[UNROLL_FACTOR_SPECIAL];
int which[UNROLL_FACTOR_SPECIAL];
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
which[c] = 1;
if (i + c < n3)
{
#if SPECIAL_DATA_PRELOAD_SIZE > 0
if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) {
special_data[c] = special_preload[j];
}
else
#endif
special_data[c] = special[ii + (i+c)*nt];
}
}
offset+=nt;
for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
if (i+k >= n1) {
which[k]++;
}
}
for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
if (i+k >= n2) {
which[k]++;
}
which[k] <<= SBBITS;
}
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
if (i + c < n3) {
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
if (special_data[c] == jtag[l]) {
nbor[l]=nbor[l] ^ which[c];
}
}
}
}
}
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
*list_addr[l] = nbor[l];
}
}
} // if ii
}