GPU Package: Adding JIT test for OpenCL prefetch support.

This commit is contained in:
W. Michael Brown
2023-03-07 21:43:19 -08:00
parent d7c783560a
commit c96ac858bf
3 changed files with 47 additions and 1 deletions

View File

@ -95,7 +95,8 @@ class UCL_Program {
/// Load a program from a string and compile with flags
inline int load_string(const void *program, const char *flags="",
std::string *log=nullptr, FILE* foutput=nullptr) {
std::string *log=nullptr, FILE* foutput=nullptr,
const int compile_test=0) {
cl_int error_flag;
const char *prog=(const char *)program;
_program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag);
@ -131,6 +132,8 @@ class UCL_Program {
}
#endif
if (build_status != CL_SUCCESS && compile_test) return UCL_COMPILE_ERROR;
if (build_status != CL_SUCCESS || log!=NULL) {
size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,

View File

@ -26,6 +26,22 @@
#if defined(USE_OPENCL)
#include "device_cl.h"
const char *ocl_prefetch_test =
" #if (NBOR_PREFETCH == 1) \n"\
" inline void ucl_prefetch(const __global int *p) { prefetch(p, 1); } \n"\
" #else \n"\
" enum LSC_LDCC {LSC_LDCC_DEFAULT, LSC_LDCC_L1UC_L3UC, LSC_LDCC_L1UC_L3C, \n"\
" LSC_LDCC_L1C_L3UC, LSC_LDCC_L1C_L3C, LSC_LDCC_L1S_L3UC, \n"\
" LSC_LDCC_L1S_L3C, LSC_LDCC_L1IAR_L3C, }; \n"\
" void __builtin_IB_lsc_prefetch_global_uint(const __global uint *, int, \n"\
" enum LSC_LDCC); \n"\
" inline void ucl_prefetch(const __global int *p) { \n"\
" __builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0, \n"\
" LSC_LDCC_L1C_L3UC); \n"\
" } \n"\
" #endif \n"\
" __kernel void ptest(__global int *i) { ucl_prefetch(i); i[0]++; } \n";
#ifdef LAL_OCL_EXTRA_ARGS
#define LAL_DM_STRINGIFY(x) #x
#define LAL_PRE_STRINGIFY(x) LAL_DM_STRINGIFY(x)
@ -396,9 +412,31 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
params[4]="0";
#endif
}
// Test OCL JIT to make sure any prefetch options are supported
#ifdef LAL_DISABLE_PREFETCH
params[18]="0";
#endif
_nbor_prefetch=-1;
if (params[18]=="2") {
_nbor_prefetch=2;
UCL_Program ptest(*gpu);
std::string ptest_args=_ocl_compile_string+" -DNBOR_PREFETCH="+params[18];
int success=ptest.load_string(ocl_prefetch_test,ptest_args.c_str(),
nullptr,nullptr,1);
if (success!=UCL_SUCCESS) params[18]="1";
}
if (params[18]=="1") {
_nbor_prefetch=1;
UCL_Program ptest(*gpu);
std::string ptest_args=_ocl_compile_string+" -DNBOR_PREFETCH="+params[18];
int success=ptest.load_string(ocl_prefetch_test,ptest_args.c_str(),
nullptr,nullptr,1);
if (success!=UCL_SUCCESS) params[18]="0";
}
if (_nbor_prefetch<0) params[18]="0";
if (params[18]=="0") _nbor_prefetch=0;
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
std::string(OCL_PRECISION_COMPILE);
@ -844,6 +882,10 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom);
fprintf(screen,"Vector width: %d.\n", simd_size());
fprintf(screen,"Prefetch mode: ");
if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n");
else if (_nbor_prefetch==1) fprintf(screen,"API.\n");
else fprintf(screen,"None.\n");
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
if (nbor.gpu_nbor()==2)
fprintf(screen,"CPU Neighbor: %.4f s.\n",times[8]/_replica_size);

View File

@ -346,6 +346,7 @@ class Device {
int _block_pair, _block_bio_pair, _block_ellipse;
int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id;
int _max_shared_types, _max_bio_shared_types, _pppm_max_spline;
int _nbor_prefetch;
UCL_Program *dev_program;
UCL_Kernel k_zero, k_info;