GPU Package: Fixing out of bounds memory access issues with special kernel unroll optimizations.
This commit is contained in:
@ -507,27 +507,27 @@ __kernel void kernel_special(__global int *dev_nbor,
|
|||||||
|
|
||||||
if (ii<nt) {
|
if (ii<nt) {
|
||||||
int stride;
|
int stride;
|
||||||
__global int *list, *list_end;
|
__global int *list;
|
||||||
|
|
||||||
int n1=nspecial[ii*3];
|
int n1=nspecial[ii*3];
|
||||||
int n2=nspecial[ii*3+1];
|
int n2=nspecial[ii*3+1];
|
||||||
int n3=nspecial[ii*3+2];
|
int n3=nspecial[ii*3+2];
|
||||||
|
|
||||||
int numj;
|
int myj;
|
||||||
if (ii < inum) {
|
if (ii < inum) {
|
||||||
stride=inum;
|
stride=inum;
|
||||||
list=dev_nbor+stride+ii;
|
list=dev_nbor+stride+ii;
|
||||||
numj=*list;
|
int numj=*list;
|
||||||
list+=stride+fast_mul(ii,t_per_atom-1);
|
list+=stride+fast_mul(ii,t_per_atom-1);
|
||||||
stride=fast_mul(inum,t_per_atom);
|
stride=fast_mul(inum,t_per_atom);
|
||||||
int njt=numj/t_per_atom;
|
myj=numj/t_per_atom;
|
||||||
list_end=list+fast_mul(njt,stride)+(numj & (t_per_atom-1));
|
if (offset < (numj & (t_per_atom-1)))
|
||||||
|
myj++;
|
||||||
list+=offset;
|
list+=offset;
|
||||||
} else {
|
} else {
|
||||||
stride=1;
|
stride=1;
|
||||||
list=host_nbor_list+(ii-inum)*max_nbors;
|
list=host_nbor_list+(ii-inum)*max_nbors;
|
||||||
numj=host_numj[ii-inum];
|
myj=host_numj[ii-inum];
|
||||||
list_end=list+fast_mul(numj,stride);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if SPECIAL_DATA_PRELOAD_SIZE > 0
|
#if SPECIAL_DATA_PRELOAD_SIZE > 0
|
||||||
@ -537,15 +537,18 @@ __kernel void kernel_special(__global int *dev_nbor,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for ( ; list<list_end; list+=UNROLL_FACTOR_LIST * stride) {
|
for (int m=0; m<myj; m+=UNROLL_FACTOR_LIST) {
|
||||||
int nbor[UNROLL_FACTOR_LIST];
|
int nbor[UNROLL_FACTOR_LIST];
|
||||||
tagint jtag[UNROLL_FACTOR_LIST];
|
tagint jtag[UNROLL_FACTOR_LIST];
|
||||||
__global int* list_addr[UNROLL_FACTOR_LIST];
|
__global int* list_addr[UNROLL_FACTOR_LIST];
|
||||||
|
int lmax = myj - m;
|
||||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||||
list_addr[l] = list + l*stride;
|
list_addr[l] = list + l*stride;
|
||||||
|
if (l < lmax)
|
||||||
nbor[l] = *list_addr[l];
|
nbor[l] = *list_addr[l];
|
||||||
}
|
}
|
||||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||||
|
if (l < lmax)
|
||||||
jtag[l] = tag[nbor[l]];
|
jtag[l] = tag[nbor[l]];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -581,7 +584,7 @@ __kernel void kernel_special(__global int *dev_nbor,
|
|||||||
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
|
for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) {
|
||||||
if (i + c < n3) {
|
if (i + c < n3) {
|
||||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||||
if (special_data[c] == jtag[l]) {
|
if (l < lmax && special_data[c] == jtag[l]) {
|
||||||
nbor[l]=nbor[l] ^ which[c];
|
nbor[l]=nbor[l] ^ which[c];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -589,9 +592,10 @@ __kernel void kernel_special(__global int *dev_nbor,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
for (int l=0; l<UNROLL_FACTOR_LIST; l++) {
|
||||||
|
if (l < lmax)
|
||||||
*list_addr[l] = nbor[l];
|
*list_addr[l] = nbor[l];
|
||||||
}
|
}
|
||||||
|
list+=UNROLL_FACTOR_LIST * stride;
|
||||||
}
|
}
|
||||||
} // if ii
|
} // if ii
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user