From f398add7d5177cd8ab01e10b6852072aa2021408 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 10 Sep 2024 00:36:13 -0500 Subject: [PATCH] fixed the value of shift to be the number of rows processed in each chunk (g2y_m * b2y) --- lib/gpu/lal_neighbor.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 62ab2b31d0..ba2a328130 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -586,21 +586,24 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int b2y=_block_cell_2d; const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x)); const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y)); + // maximum number of blocks on the device const int max_num_blocks = 65535; int shift = 0; if (g2y < max_num_blocks) { _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); } else { - const int num_rounds = ceil(static_cast<double>(g2y) / max_num_blocks); + // using a fixed number of blocks int g2y_m = 65534; - for (int i = 0; i < num_rounds; i++) { + // number of chunks needed for the whole transpose + const int num_chunks = ceil(static_cast<double>(g2y) / g2y_m); + for (int i = 0; i < num_chunks; i++) { _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y); _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); - shift += g2y_m; + shift += g2y_m*b2y; } } - + time_transpose.stop(); }