GPU Package: Adding JIT test for OpenCL prefetch support.

This commit is contained in:
W. Michael Brown
2023-03-07 21:43:19 -08:00
parent d7c783560a
commit c96ac858bf
3 changed files with 47 additions and 1 deletions

View File

@ -26,6 +26,22 @@
#if defined(USE_OPENCL)
#include "device_cl.h"
// OpenCL C source for a tiny probe kernel ("ptest"). It is JIT-compiled at
// run time with -DNBOR_PREFETCH=<mode> to check that the device compiler
// accepts the requested neighbor prefetch mode:
//   NBOR_PREFETCH == 1 : ucl_prefetch() wraps the OpenCL prefetch() built-in
//                        (reported as "API" in the timing output).
//   any other value    : ucl_prefetch() wraps a forward-declared
//                        __builtin_IB_lsc_prefetch_global_uint() call
//                        (reported as "Intrinsics"; presumably a
//                        vendor-specific built-in -- confirm).
// The kernel calls ucl_prefetch() on its argument and then increments i[0].
// Adjacent string literals are concatenated by the compiler, so no line
// continuations are needed between them.
const char *ocl_prefetch_test =
  " #if (NBOR_PREFETCH == 1) \n"
  " inline void ucl_prefetch(const __global int *p) { prefetch(p, 1); } \n"
  " #else \n"
  " enum LSC_LDCC {LSC_LDCC_DEFAULT, LSC_LDCC_L1UC_L3UC, LSC_LDCC_L1UC_L3C, \n"
  " LSC_LDCC_L1C_L3UC, LSC_LDCC_L1C_L3C, LSC_LDCC_L1S_L3UC, \n"
  " LSC_LDCC_L1S_L3C, LSC_LDCC_L1IAR_L3C, }; \n"
  " void __builtin_IB_lsc_prefetch_global_uint(const __global uint *, int, \n"
  " enum LSC_LDCC); \n"
  " inline void ucl_prefetch(const __global int *p) { \n"
  " __builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0, \n"
  " LSC_LDCC_L1C_L3UC); \n"
  " } \n"
  " #endif \n"
  " __kernel void ptest(__global int *i) { ucl_prefetch(i); i[0]++; } \n";
#ifdef LAL_OCL_EXTRA_ARGS
#define LAL_DM_STRINGIFY(x) #x
#define LAL_PRE_STRINGIFY(x) LAL_DM_STRINGIFY(x)
@ -396,9 +412,31 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args)
params[4]="0";
#endif
}
// Test OCL JIT to make sure any prefetch options are supported
#ifdef LAL_DISABLE_PREFETCH
params[18]="0";
#endif
_nbor_prefetch=-1;
if (params[18]=="2") {
_nbor_prefetch=2;
UCL_Program ptest(*gpu);
std::string ptest_args=_ocl_compile_string+" -DNBOR_PREFETCH="+params[18];
int success=ptest.load_string(ocl_prefetch_test,ptest_args.c_str(),
nullptr,nullptr,1);
if (success!=UCL_SUCCESS) params[18]="1";
}
if (params[18]=="1") {
_nbor_prefetch=1;
UCL_Program ptest(*gpu);
std::string ptest_args=_ocl_compile_string+" -DNBOR_PREFETCH="+params[18];
int success=ptest.load_string(ocl_prefetch_test,ptest_args.c_str(),
nullptr,nullptr,1);
if (success!=UCL_SUCCESS) params[18]="0";
}
if (_nbor_prefetch<0) params[18]="0";
if (params[18]=="0") _nbor_prefetch=0;
if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math ";
_ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+
std::string(OCL_PRECISION_COMPILE);
@ -844,6 +882,10 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom);
fprintf(screen,"Vector width: %d.\n", simd_size());
fprintf(screen,"Prefetch mode: ");
if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n");
else if (_nbor_prefetch==1) fprintf(screen,"API.\n");
else fprintf(screen,"None.\n");
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
if (nbor.gpu_nbor()==2)
fprintf(screen,"CPU Neighbor: %.4f s.\n",times[8]/_replica_size);