Misc Improvements to GPU Package
- Optimizations for molecular systems
- Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communication for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
@ -52,7 +52,7 @@ _texture_2d( pos_tex,int4);
|
||||
compute the id of the cell to which each atom belongs
|
||||
x: atom coordinates
|
||||
cell_id: cell ids
|
||||
particle_id:
|
||||
particle_id:
|
||||
boxlo[0-2]: the lower left corner of the local box
|
||||
ncell[xyz]: the number of cells in xyz dims
|
||||
i_cell_size is the inverse cell size
|
||||
@ -489,6 +489,10 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
|
||||
|
||||
#endif
|
||||
|
||||
/* Number of entries of special[] copied into the special_preload[] array
   ahead of the neighbor-list loop in kernel_special. The preload code is
   guarded by "#if SPECIAL_DATA_PRELOAD_SIZE > 0", so a value of 0 compiles
   the preload path out. */
#define SPECIAL_DATA_PRELOAD_SIZE 3
|
||||
/* Number of neighbor-list entries (nbor[], jtag[], list_addr[]) handled per
   iteration of the outer list loop, which advances by
   UNROLL_FACTOR_LIST * stride. */
#define UNROLL_FACTOR_LIST 4
|
||||
/* Number of special[] entries (special_data[], which[]) handled per iteration
   of the inner loop over the n3 special entries; also the step used when
   filling special_preload[]. */
#define UNROLL_FACTOR_SPECIAL 2
|
||||
|
||||
__kernel void kernel_special(__global int *dev_nbor,
|
||||
__global int *host_nbor_list,
|
||||
const __global int *host_numj,
|
||||
@ -526,23 +530,68 @@ __kernel void kernel_special(__global int *dev_nbor,
|
||||
list_end=list+fast_mul(numj,stride);
|
||||
}
|
||||
|
||||
for ( ; list<list_end; list+=stride) {
|
||||
int nbor=*list;
|
||||
tagint jtag=tag[nbor];
|
||||
#if SPECIAL_DATA_PRELOAD_SIZE > 0
|
||||
tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE];
|
||||
for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) {
|
||||
special_preload[j] = special[ii + i*nt];
|
||||
}
|
||||
#endif
|
||||
|
||||
int offset=ii;
|
||||
for (int i=0; i<n3; i++) {
|
||||
if (special[offset]==jtag) {
|
||||
int which = 1;
|
||||
if (i>=n1)
|
||||
which++;
|
||||
if (i>=n2)
|
||||
which++;
|
||||
nbor=nbor ^ (which << SBBITS);
|
||||
*list=nbor;
|
||||
for ( ; list<list_end; list+=UNROLL_FACTOR_LIST * stride) {
|
||||
int nbor[UNROLL_FACTOR_LIST];
|
||||
tagint jtag[UNROLL_FACTOR_LIST];
|
||||
__global int* list_addr[UNROLL_FACTOR_LIST];
|
||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||
list_addr[l] = list + l*stride;
|
||||
nbor[l] = *list_addr[l];
|
||||
}
|
||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||
jtag[l] = tag[nbor[l]];
|
||||
}
|
||||
|
||||
for (int i=0, j=0; i<n3; i+=UNROLL_FACTOR_SPECIAL, j++) {
|
||||
tagint special_data[UNROLL_FACTOR_SPECIAL];
|
||||
int which[UNROLL_FACTOR_SPECIAL];
|
||||
|
||||
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
|
||||
which[c] = 1;
|
||||
if (i + c < n3)
|
||||
{
|
||||
#if SPECIAL_DATA_PRELOAD_SIZE > 0
|
||||
if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) {
|
||||
special_data[c] = special_preload[j];
|
||||
}
|
||||
else
|
||||
#endif
|
||||
special_data[c] = special[ii + (i+c)*nt];
|
||||
}
|
||||
}
|
||||
offset+=nt;
|
||||
|
||||
for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
|
||||
if (i+k >= n1) {
|
||||
which[k]++;
|
||||
}
|
||||
}
|
||||
for (int k=0; k<UNROLL_FACTOR_SPECIAL; k++) {
|
||||
if (i+k >= n2) {
|
||||
which[k]++;
|
||||
}
|
||||
which[k] <<= SBBITS;
|
||||
}
|
||||
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
|
||||
if (i + c < n3) {
|
||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||
if (special_data[c] == jtag[l]) {
|
||||
nbor[l]=nbor[l] ^ which[c];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||
*list_addr[l] = nbor[l];
|
||||
}
|
||||
}
|
||||
} // if ii
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user