Reverted the block size tuning, which caused bugs for low atom counts (will revisit later)
@@ -278,14 +278,16 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();
 
   // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  /*
   const int cus = this->device->gpu->cus();
   int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
-  while (GX < cus) {
+  while (GX < cus && GX > 1) {
     BX /= 2;
     GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
   }
 
+  */
   this->time_pair.start();
 
   // Build the short neighbor list if not done yet
@@ -727,14 +727,16 @@ int BaseAmoebaT::fphi_uind() {
     return 0;
 
   // Compute the block size and grid size to keep all cores busy
-  const int cus = device->gpu->cus();
-  int BX=block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
-  while (GX < cus) {
+  const int BX=block_size();
+  const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  /*
+  const int cus = this->device->gpu->cus();
+  while (GX < cus && GX > 1) {
     BX /= 2;
     GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
   }
 
+  */
   time_pair.start();
   int ngridxy = _ngridx * _ngridy;
   k_fphi_uind.set_size(GX,BX);
@@ -793,14 +795,16 @@ int BaseAmoebaT::fphi_mpole() {
   int nbor_pitch=nbor->nbor_pitch();
 
   // Compute the block size and grid size to keep all cores busy
+  const int BX=block_size();
+  const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
+  /*
   const int cus = device->gpu->cus();
   int BX=block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
-  while (GX < cus) {
+  while (GX < cus && GX > 1) {
     BX /= 2;
     GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
   }
 
+  */
   time_pair.start();
   int ngridxy = _ngridx * _ngridy;
   k_fphi_mpole.set_size(GX,BX);
@@ -619,14 +619,16 @@ int HippoT::polar_real(const int eflag, const int vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();
 
   // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  const int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
+  /*
   const int cus = this->device->gpu->cus();
   int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
-  while (GX < cus) {
+  while (GX < cus && GX > 1) {
     BX /= 2;
     GX=static_cast<int>(ceil(static_cast<double>(ainum)/(BX/this->_threads_per_atom)));
   }
 
+  */
   this->time_pair.start();
 
   // Build the short neighbor list if not done yet
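
For context on the "bugs for low atom counts" mentioned in the commit message: with only a few atoms, GX = ceil(ainum/BX) (or ceil(ainum/(BX/_threads_per_atom)) in the polar_real variants) may never reach the compute-unit count, so the reverted tuning loop keeps halving BX until BX (or BX/_threads_per_atom) reaches zero and the grid-size expression divides by zero. The GX > 1 guard in the commented-out variant presumably skips tuning when a single block already covers all atoms. Below is a minimal standalone sketch, not part of the commit, that reproduces the degenerate halving; the atom count, block size, threads-per-atom, and compute-unit values are assumptions chosen for illustration, and the explicit break stands in for the division by zero the unguarded loop would hit.

// Minimal sketch (not LAMMPS code) of the failure mode for small atom counts.
#include <cmath>
#include <cstdio>

int main() {
  const int ainum = 32;             // assumed: small atom count
  const int threads_per_atom = 4;   // assumed
  const int cus = 80;               // assumed: number of GPU compute units
  int BX = 256;                     // assumed: initial block size
  int GX = static_cast<int>(ceil(static_cast<double>(ainum)/(BX/threads_per_atom)));

  // Original condition was simply (GX < cus); with few atoms GX stays small,
  // so BX is halved until BX/threads_per_atom hits zero.
  while (GX < cus) {
    BX /= 2;
    if (BX/threads_per_atom == 0) {
      printf("BX collapsed to %d: next grid-size division would be by zero\n", BX);
      break;
    }
    GX = static_cast<int>(ceil(static_cast<double>(ainum)/(BX/threads_per_atom)));
  }
  printf("final BX=%d GX=%d\n", BX, GX);
  return 0;
}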