Fixed the value of shift so that it advances by the number of rows processed in each chunk (g2y_m * b2y)

This commit is contained in:
Trung Nguyen
2024-09-10 00:36:13 -05:00
parent 23bdc5ddc2
commit f398add7d5

View File

@ -586,21 +586,24 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
// maximum number of blocks on the device
const int max_num_blocks = 65535;
int shift = 0;
if (g2y < max_num_blocks) {
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
} else {
const int num_rounds = ceil(static_cast<double>(g2y) / max_num_blocks);
// using a fixed number of blocks
int g2y_m = 65534;
for (int i = 0; i < num_rounds; i++) {
// number of chunks needed for the whole transpose
const int num_chunks = ceil(static_cast<double>(g2y) / g2y_m);
for (int i = 0; i < num_chunks; i++) {
_shared->k_transpose.set_size(g2x,g2y_m,b2x,b2y);
_shared->k_transpose.run(&dev_special,&dev_special_t,&_maxspecial,&nt,&shift);
shift += g2y_m;
shift += g2y_m*b2y;
}
}
time_transpose.stop();
}