diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h index 14a319f391..7b7fca9dfc 100644 --- a/lib/gpu/geryon/ocl_kernel.h +++ b/lib/gpu/geryon/ocl_kernel.h @@ -95,7 +95,8 @@ class UCL_Program { /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", - std::string *log=nullptr, FILE* foutput=nullptr) { + std::string *log=nullptr, FILE* foutput=nullptr, + const int compile_test=0) { cl_int error_flag; const char *prog=(const char *)program; _program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag); @@ -131,6 +132,8 @@ class UCL_Program { } #endif + if (build_status != CL_SUCCESS && compile_test) return UCL_COMPILE_ERROR; + if (build_status != CL_SUCCESS || log!=NULL) { size_t ms; CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index af53572590..cbf3f5f885 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -26,6 +26,22 @@ #if defined(USE_OPENCL) #include "device_cl.h" +const char *ocl_prefetch_test = +" #if (NBOR_PREFETCH == 1) \n"\ +" inline void ucl_prefetch(const __global int *p) { prefetch(p, 1); } \n"\ +" #else \n"\ +" enum LSC_LDCC {LSC_LDCC_DEFAULT, LSC_LDCC_L1UC_L3UC, LSC_LDCC_L1UC_L3C, \n"\ +" LSC_LDCC_L1C_L3UC, LSC_LDCC_L1C_L3C, LSC_LDCC_L1S_L3UC, \n"\ +" LSC_LDCC_L1S_L3C, LSC_LDCC_L1IAR_L3C, }; \n"\ +" void __builtin_IB_lsc_prefetch_global_uint(const __global uint *, int, \n"\ +" enum LSC_LDCC); \n"\ +" inline void ucl_prefetch(const __global int *p) { \n"\ +" __builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0, \n"\ +" LSC_LDCC_L1C_L3UC); \n"\ +" } \n"\ +" #endif \n"\ +" __kernel void ptest(__global int *i) { ucl_prefetch(i); i[0]++; } \n"; + #ifdef LAL_OCL_EXTRA_ARGS #define LAL_DM_STRINGIFY(x) #x #define LAL_PRE_STRINGIFY(x) LAL_DM_STRINGIFY(x) @@ -396,9 +412,31 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args) params[4]="0"; #endif } + + // Test OCL JIT to make sure any prefetch options are supported #ifdef LAL_DISABLE_PREFETCH params[18]="0"; #endif + _nbor_prefetch=-1; + if (params[18]=="2") { + _nbor_prefetch=2; + UCL_Program ptest(*gpu); + std::string ptest_args=_ocl_compile_string+" -DNBOR_PREFETCH="+params[18]; + int success=ptest.load_string(ocl_prefetch_test,ptest_args.c_str(), + nullptr,nullptr,1); + if (success!=UCL_SUCCESS) params[18]="1"; + } + if (params[18]=="1") { + _nbor_prefetch=1; + UCL_Program ptest(*gpu); + std::string ptest_args=_ocl_compile_string+" -DNBOR_PREFETCH="+params[18]; + int success=ptest.load_string(ocl_prefetch_test,ptest_args.c_str(), + nullptr,nullptr,1); + if (success!=UCL_SUCCESS) params[18]="0"; + } + if (_nbor_prefetch<0) params[18]="0"; + if (params[18]=="0") _nbor_prefetch=0; + if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math "; _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+ std::string(OCL_PRECISION_COMPILE); @@ -844,6 +882,10 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); fprintf(screen,"Vector width: %d.\n", simd_size()); + fprintf(screen,"Prefetch mode: "); + if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n"); + else if (_nbor_prefetch==1) fprintf(screen,"API.\n"); + else fprintf(screen,"None.\n"); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); if (nbor.gpu_nbor()==2) fprintf(screen,"CPU Neighbor: %.4f s.\n",times[8]/_replica_size); diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 3b27223007..ba693e551a 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -346,6 +346,7 @@ class Device { int _block_pair, _block_bio_pair, _block_ellipse; int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id; int _max_shared_types, _max_bio_shared_types, _pppm_max_spline; + int _nbor_prefetch; UCL_Program *dev_program; UCL_Kernel k_zero, k_info;