From 4a11b966352f35f151fc6e376ac991a64c61cf54 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 10 Sep 2024 23:31:37 -0400 Subject: [PATCH] update with upstream --- lib/gpu/lal_neighbor.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 10816e2fa6..288415e0e7 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -586,8 +586,25 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int b2y=_block_cell_2d; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); - _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); - _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt); + // the maximum number of blocks on the device is typically 65535 + // in principle we can use a lower number to have more resource per block 32768 + const int max_num_blocks = 65535; + int shift = 0; + if (g2y < max_num_blocks) { + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); + } else { + // using a fixed number of blocks + int g2y_m = max_num_blocks; + _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y); + // number of chunks needed for the whole transpose + const int num_chunks = ceil(static_cast(g2y) / g2y_m); + for (int i = 0; i < num_chunks; i++) { + _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift); + shift += g2y_m*b2y; + } + } + time_transpose.stop(); }