diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 1c0aa77706..38aa2bde27 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -278,14 +278,16 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* const int cus = this->device->gpu->cus(); - int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < cus) { + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } - + */ this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 8e4e8faf83..e6ffcd764a 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -727,14 +727,16 @@ int BaseAmoebaT::fphi_uind() { return 0; // Compute the block size and grid size to keep all cores busy - const int cus = device->gpu->cus(); - int BX=block_size(); - int GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < cus) { + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } - + */ time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); @@ -793,14 +795,16 @@ int BaseAmoebaT::fphi_mpole() { int nbor_pitch=nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + /* const int cus = device->gpu->cus(); - int BX=block_size(); - int GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < cus) { + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } - + */ time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_mpole.set_size(GX,BX); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index f20a0cfd62..d4366cac85 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -619,14 +619,16 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int cus = this->device->gpu->cus(); - int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < cus) { + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } - + */ this->time_pair.start(); // Build the short neighbor list if not done yet