From 23bdc5ddc2b8d7116b3822a744d797900ebd43f7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 10 Sep 2024 00:18:50 -0500 Subject: [PATCH 1/4] Allowed number of blocks greater than 65,535 for k_transpose --- lib/gpu/lal_neighbor.cpp | 17 +++++++++++++++-- lib/gpu/lal_neighbor_gpu.cu | 10 +++++----- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 10816e2fa6..62ab2b31d0 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -586,8 +586,21 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int b2y=_block_cell_2d; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); - _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); - _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt); + const int max_num_blocks = 65535; + int shift = 0; + if (g2y < max_num_blocks) { + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); + } else { + const int num_rounds = ceil(static_cast(g2y) / max_num_blocks); + int g2y_m = 65534; + for (int i = 0; i < num_rounds; i++) { + _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y); + _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); + shift += g2y_m; + } + } + time_transpose.stop(); } diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index a7506fc5c3..7d0941ccd5 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -147,7 +147,7 @@ __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, __kernel void transpose(__global tagint *restrict out, const __global tagint *restrict in, - int columns_in, int rows_in) + int columns_in, int rows_in, int shift) { __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; @@ -158,15 +158,15 @@ __kernel void transpose(__global tagint *restrict out, unsigned i=bi*BLOCK_CELL_2D+ti; unsigned j=bj*BLOCK_CELL_2D+tj; - if ((i Date: Tue, 10 Sep 2024 00:36:13 -0500 Subject: [PATCH 2/4] fixed the value of shift being the number of rows processed in each chunk (g2y_m * b2y) --- lib/gpu/lal_neighbor.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 62ab2b31d0..ba2a328130 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -586,21 +586,24 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int b2y=_block_cell_2d; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); + // maximum number of blocks on the device const int max_num_blocks = 65535; int shift = 0; if (g2y < max_num_blocks) { _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); } else { - const int num_rounds = ceil(static_cast(g2y) / max_num_blocks); + // using a fixed number of blocks int g2y_m = 65534; - for (int i = 0; i < num_rounds; i++) { + // number of chunks needed for the whole transpose + const int num_chunks = ceil(static_cast(g2y) / g2y_m); + for (int i = 0; i < num_chunks; i++) { _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y); _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); - shift += g2y_m; + shift += g2y_m*b2y; } } - + time_transpose.stop(); } From c63c88f8b6804f58399429dd3fd52bcf634109ee Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 10 Sep 2024 08:58:42 -0500 Subject: [PATCH 3/4] reduced the max number of blocks for each transpose --- lib/gpu/lal_neighbor.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index ba2a328130..101e92953c 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -586,19 +586,20 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int b2y=_block_cell_2d; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); - // maximum number of blocks on the device - const int max_num_blocks = 65535; + // the maximum number of blocks on the device is typically 65535 + // we can use a lower number to have more resource per block + const int max_num_blocks = 32768; int shift = 0; if (g2y < max_num_blocks) { _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); } else { // using a fixed number of blocks - int g2y_m = 65534; + int g2y_m = max_num_blocks; + _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y); // number of chunks needed for the whole transpose const int num_chunks = ceil(static_cast(g2y) / g2y_m); - for (int i = 0; i < num_chunks; i++) { - _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y); + for (int i = 0; i < num_chunks; i++) { _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); shift += g2y_m*b2y; } From b16bb27184cb873ff9da8136d99f9043b6c19e91 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 10 Sep 2024 09:47:07 -0500 Subject: [PATCH 4/4] revert to using the max number of blocks on device for each pass, as the number of blocks (32767 or 65535) already saturates the number of SMs anyway --- lib/gpu/lal_neighbor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 101e92953c..051f55f0a3 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -587,8 +587,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); // the maximum number of blocks on the device is typically 65535 - // we can use a lower number to have more resource per block - const int max_num_blocks = 32768; + // in principle we can use a lower number to have more resource per block 32768 + const int max_num_blocks = 65535; int shift = 0; if (g2y < max_num_blocks) { _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);