From 23bdc5ddc2b8d7116b3822a744d797900ebd43f7 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Tue, 10 Sep 2024 00:18:50 -0500
Subject: [PATCH 1/4] Allowed number of blocks greater than 65,535 for
 k_transpose

---
 lib/gpu/lal_neighbor.cpp    | 17 +++++++++++++++--
 lib/gpu/lal_neighbor_gpu.cu | 10 +++++-----
 2 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index 10816e2fa6..62ab2b31d0 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -586,8 +586,21 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     const int b2y=_block_cell_2d;
     const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
     const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
-    _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
-    _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt);
+    const int max_num_blocks = 65535;
+    int shift = 0;
+    if (g2y < max_num_blocks) {
+      _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
+      _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
+    } else {
+      const int num_rounds = ceil(static_cast<double>(g2y) / max_num_blocks);
+      int g2y_m = 65534;
+      for (int i = 0; i < num_rounds; i++) {
+        _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y);
+        _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
+        shift += g2y_m;
+      }
+    }
+    
     time_transpose.stop();
   }
 
diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu
index a7506fc5c3..7d0941ccd5 100644
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -147,7 +147,7 @@ __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
 
 __kernel void transpose(__global tagint *restrict out,
                         const __global tagint *restrict in,
-                        int columns_in, int rows_in)
+                        int columns_in, int rows_in, int shift)
 {
   __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
 
@@ -158,15 +158,15 @@ __kernel void transpose(__global tagint *restrict out,
 
   unsigned i=bi*BLOCK_CELL_2D+ti;
   unsigned j=bj*BLOCK_CELL_2D+tj;
-  if ((i<columns_in) && (j<rows_in))
-    block[tj][ti]=in[j*columns_in+i];
+  if ((i<columns_in) && (j+shift<rows_in))
+    block[tj][ti]=in[(j+shift)*columns_in+i];
 
    __syncthreads();
 
   i=bj*BLOCK_CELL_2D+ti;
   j=bi*BLOCK_CELL_2D+tj;
-  if ((i<rows_in) && (j<columns_in))
-    out[j*rows_in+i] = block[ti][tj];
+  if ((i+shift<rows_in) && (j<columns_in))
+    out[j*rows_in+i+shift] = block[ti][tj];
 }
 
 #ifndef LAL_USE_OLD_NEIGHBOR

From f398add7d5177cd8ab01e10b6852072aa2021408 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Tue, 10 Sep 2024 00:36:13 -0500
Subject: [PATCH 2/4] fixed the value of shift being the number of rows
 processed in each chunk (g2y_m * b2y)

---
 lib/gpu/lal_neighbor.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index 62ab2b31d0..ba2a328130 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -586,21 +586,24 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     const int b2y=_block_cell_2d;
     const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
     const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
+    // maximum number of blocks on the device
     const int max_num_blocks = 65535;
     int shift = 0;
     if (g2y < max_num_blocks) {
       _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
       _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
     } else {
-      const int num_rounds = ceil(static_cast<double>(g2y) / max_num_blocks);
+      // using a fixed number of blocks
       int g2y_m = 65534;
-      for (int i = 0; i < num_rounds; i++) {
+      // number of chunks needed for the whole transpose
+      const int num_chunks = ceil(static_cast<double>(g2y) / g2y_m);
+      for (int i = 0; i < num_chunks; i++) {
         _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y);
         _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
-        shift += g2y_m;
+        shift += g2y_m*b2y;
       }
     }
-    
+
     time_transpose.stop();
   }
 

From c63c88f8b6804f58399429dd3fd52bcf634109ee Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Tue, 10 Sep 2024 08:58:42 -0500
Subject: [PATCH 3/4] reduced the max number of blocks for each transpose

---
 lib/gpu/lal_neighbor.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index ba2a328130..101e92953c 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -586,19 +586,20 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     const int b2y=_block_cell_2d;
     const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
     const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
-    // maximum number of blocks on the device
-    const int max_num_blocks = 65535;
+    // the maximum number of blocks on the device is typically 65535
+    // we can use a lower number to have more resource per block
+    const int max_num_blocks = 32768;
     int shift = 0;
     if (g2y < max_num_blocks) {
       _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
       _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
     } else {
       // using a fixed number of blocks
-      int g2y_m = 65534;
+      int g2y_m = max_num_blocks;
+      _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y);
       // number of chunks needed for the whole transpose
       const int num_chunks = ceil(static_cast<double>(g2y) / g2y_m);
-      for (int i = 0; i < num_chunks; i++) {
-        _shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y);
+      for (int i = 0; i < num_chunks; i++) {        
         _shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
         shift += g2y_m*b2y;
       }

From b16bb27184cb873ff9da8136d99f9043b6c19e91 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Tue, 10 Sep 2024 09:47:07 -0500
Subject: [PATCH 4/4] revert to using the max number of blocks on device for
 each pass, as the number of blocks (32767 or 65535) already saturates the
 number of SMs anyway

---
 lib/gpu/lal_neighbor.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index 101e92953c..051f55f0a3 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -587,8 +587,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
     const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
     // the maximum number of blocks on the device is typically 65535
-    // we can use a lower number to have more resource per block
-    const int max_num_blocks = 32768;
+    // in principle we can use a lower number to have more resource per block 32768
+    const int max_num_blocks = 65535;
     int shift = 0;
     if (g2y < max_num_blocks) {
       _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);