Misc Improvements to GPU Package

- Optimizations for molecular systems
-   Improved kernel performance and greater CPU overlap
- Reduced GPU to CPU communications for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
W. Michael Brown
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions

View File

@ -370,7 +370,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
_ocl_config_name="CUSTOM";
int token_count=0;
std::string params[18];
std::string params[19];
char ocl_config[2048];
strncpy(ocl_config,s_config.c_str(),2047);
char *pch = strtok(ocl_config,",");
@ -378,7 +378,7 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
pch = strtok(nullptr,",");
if (pch == nullptr) return -11;
while (pch != nullptr) {
if (token_count==18)
if (token_count==19)
return -11;
params[token_count]=pch;
token_count++;
@ -389,6 +389,16 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
#ifdef CL_VERSION_2_0
_ocl_compile_string+="-cl-std=CL2.0 ";
#endif
if (params[0]=="500") {
_ocl_compile_string+="-DINTEL_OCL ";
#ifdef _DOUBLE_DOUBLE
// workaround for double precision with Intel OpenCL
params[4]="0";
#endif
}
#ifdef LAL_DISABLE_PREFETCH
params[18]="0";
#endif
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
std::string(OCL_PRECISION_COMPILE);
@ -421,7 +431,8 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
" -DMAX_SHARED_TYPES="+params[15]+
" -DMAX_BIO_SHARED_TYPES="+params[16]+
" -DPPPM_MAX_SPLINE="+params[17];
" -DPPPM_MAX_SPLINE="+params[17]+
" -DNBOR_PREFETCH="+params[18];
_ocl_compile_string += extra_args;
#endif
return 0;
@ -558,7 +569,11 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
return -3;
if (_user_cell_size<0.0) {
#ifndef LAL_USE_OLD_NEIGHBOR
_neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size());
#else
_neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size());
#endif
} else
_neighbor_shared.setup_auto_cell_size(false,_user_cell_size,nbor->simd_size());
nbor->set_cutoff(cutoff);
@ -954,7 +969,7 @@ int DeviceT::compile_kernels() {
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_Vector<int,int> gpu_lib_data(19,*gpu,UCL_NOT_PINNED);
UCL_Vector<int,int> gpu_lib_data(20,*gpu,UCL_NOT_PINNED);
k_info.set_size(1,1);
k_info.run(&gpu_lib_data);
gpu_lib_data.update_host(false);