Misc Improvements to GPU Package

- Optimizations for molecular systems
-   Improved kernel performance and greater CPU overlap
- Reduced GPU to CPU communications for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
W. Michael Brown
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions

View File

@ -143,10 +143,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
_max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
_tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_tep.alloc(_max_tep_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_fieldp_size = _max_tep_size;
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_fieldp.alloc(_max_fieldp_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_thetai_size = 0;
@ -387,7 +387,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
if (inum_full>_max_tep_size) {
_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
_tep.resize(_max_tep_size*4);
_tep.resize(_max_tep_size*3);
}
*tep_ptr=_tep.host.begin();
@ -403,7 +403,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full,
// copy tep from device to host
_tep.update_host(_max_tep_size*4,false);
_tep.update_host(_max_tep_size*3,false);
}
// ---------------------------------------------------------------------------
@ -429,7 +429,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double
// copy field and fieldp from device to host (_fieldp store both arrays, one after another)
_fieldp.update_host(_max_fieldp_size*8,false);
_fieldp.update_host(_max_fieldp_size*6,false);
}
// ---------------------------------------------------------------------------
@ -456,7 +456,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
// NOTE: move this step to update_fieldp() to delay device-host transfer
// after umutual1 and self are done on the GPU
// *fieldp_ptr=_fieldp.host.begin();
// _fieldp.update_host(_max_fieldp_size*8,false);
// _fieldp.update_host(_max_fieldp_size*6,false);
}
// ---------------------------------------------------------------------------
@ -732,7 +732,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
device->add_ans_object(ans);
// copy tep from device to host
_tep.update_host(_max_tep_size*4,false);
_tep.update_host(_max_tep_size*3,false);
}
// ---------------------------------------------------------------------------