Merge pull request #4319 from ndtrung81/gpu_max_num_blocks
Enable large numbers of atoms per proc with the GPU package
This commit is contained in:
@ -586,8 +586,25 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
|
|||||||
const int b2y=_block_cell_2d;
|
const int b2y=_block_cell_2d;
|
||||||
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
|
||||||
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
|
||||||
|
// the maximum number of blocks along the y-dimension of the grid is typically 65535
|
||||||
|
// in principle we could use a lower limit (e.g. 32768) to leave more resources per block
|
||||||
|
const int max_num_blocks = 65535;
|
||||||
|
int shift = 0;
|
||||||
|
if (g2y < max_num_blocks) {
|
||||||
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
|
||||||
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);
|
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
|
||||||
|
} else {
|
||||||
|
// using a fixed number of blocks
|
||||||
|
int g2y_m = max_num_blocks;
|
||||||
|
_shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y);
|
||||||
|
// number of chunks needed for the whole transpose
|
||||||
|
const int num_chunks = ceil(static_cast<double>(g2y) / g2y_m);
|
||||||
|
for (int i = 0; i < num_chunks; i++) {
|
||||||
|
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
|
||||||
|
shift += g2y_m*b2y;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
time_transpose.stop();
|
time_transpose.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -147,7 +147,7 @@ __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
|
|||||||
|
|
||||||
__kernel void transpose(__global tagint *restrict out,
|
__kernel void transpose(__global tagint *restrict out,
|
||||||
const __global tagint *restrict in,
|
const __global tagint *restrict in,
|
||||||
int columns_in, int rows_in)
|
int columns_in, int rows_in, int shift)
|
||||||
{
|
{
|
||||||
__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
__local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
|
||||||
|
|
||||||
@ -158,15 +158,15 @@ __kernel void transpose(__global tagint *restrict out,
|
|||||||
|
|
||||||
unsigned i=bi*BLOCK_CELL_2D+ti;
|
unsigned i=bi*BLOCK_CELL_2D+ti;
|
||||||
unsigned j=bj*BLOCK_CELL_2D+tj;
|
unsigned j=bj*BLOCK_CELL_2D+tj;
|
||||||
if ((i<columns_in) && (j<rows_in))
|
if ((i<columns_in) && (j+shift<rows_in))
|
||||||
block[tj][ti]=in[j*columns_in+i];
|
block[tj][ti]=in[(j+shift)*columns_in+i];
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
i=bj*BLOCK_CELL_2D+ti;
|
i=bj*BLOCK_CELL_2D+ti;
|
||||||
j=bi*BLOCK_CELL_2D+tj;
|
j=bi*BLOCK_CELL_2D+tj;
|
||||||
if ((i<rows_in) && (j<columns_in))
|
if ((i+shift<rows_in) && (j<columns_in))
|
||||||
out[j*rows_in+i] = block[ti][tj];
|
out[j*rows_in+i+shift] = block[ti][tj];
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef LAL_USE_OLD_NEIGHBOR
|
#ifndef LAL_USE_OLD_NEIGHBOR
|
||||||
|
|||||||
Reference in New Issue
Block a user