Misc Improvements to GPU Package

- Optimizations for molecular systems
- Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communications for discrete devices
- Switched classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -143,10 +143,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);

  _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
-  _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _tep.alloc(_max_tep_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);

  _max_fieldp_size = _max_tep_size;
-  _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  _fieldp.alloc(_max_fieldp_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);

  _max_thetai_size = 0;

@ -387,7 +387,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,

  if (inum_full>_max_tep_size) {
    _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
-    _tep.resize(_max_tep_size*4);
+    _tep.resize(_max_tep_size*3);
  }
  *tep_ptr=_tep.host.begin();

@ -403,7 +403,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,

  // copy tep from device to host

-  _tep.update_host(_max_tep_size*4,false);
+  _tep.update_host(_max_tep_size*3,false);
 }

 // ---------------------------------------------------------------------------
@ -429,7 +429,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)

-  _fieldp.update_host(_max_fieldp_size*8,false);
+  _fieldp.update_host(_max_fieldp_size*6,false);
 }

 // ---------------------------------------------------------------------------
@ -456,7 +456,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
  // NOTE: move this step to update_fieldp() to delay device-host transfer
  //       after umutual1 and self are done on the GPU
  // *fieldp_ptr=_fieldp.host.begin();
-  // _fieldp.update_host(_max_fieldp_size*8,false);
+  // _fieldp.update_host(_max_fieldp_size*6,false);
 }

 // ---------------------------------------------------------------------------
@ -732,7 +732,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
  device->add_ans_object(ans);

  // copy tep from device to host
-  _tep.update_host(_max_tep_size*4,false);
+  _tep.update_host(_max_tep_size*3,false);
 }

 // ---------------------------------------------------------------------------