From 71464d831428bf048bdf225008558be01eb249b0 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown"
Date: Wed, 28 Sep 2022 22:30:09 -0700
Subject: [PATCH] GPU Package: Fixing logic in OpenCL backend that could
 result in unnecessary device allocations.

---
 lib/gpu/geryon/ocl_device.h  |  9 ++++++---
 lib/gpu/geryon/ocl_memory.h  | 38 +++++++++++++++++++++++-------------
 lib/gpu/geryon/ucl_basemat.h | 14 ++++++++++---
 lib/gpu/geryon/ucl_h_mat.h   |  5 ++++-
 lib/gpu/geryon/ucl_h_vec.h   |  5 ++++-
 5 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 6a563b5f47..ceb0ded745 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -99,6 +99,7 @@ struct OCLProperties {
   int cl_device_version;
   bool has_subgroup_support;
   bool has_shuffle_support;
+  bool shared_main_memory;
 };
 
 /// Class for looking at data parallel device properties
@@ -226,7 +227,7 @@ class UCL_Device {
   inline bool shared_memory() { return shared_memory(_device); }
   /// Returns true if host memory is efficiently addressable from device
   inline bool shared_memory(const int i)
-    { return _shared_mem_device(_cl_devices[i]); }
+    { return _properties[i].shared_main_memory; }
 
   /// Returns preferred vector width
   inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
@@ -582,8 +583,9 @@ void UCL_Device::add_properties(cl_device_id device_list) {
   op.preferred_vector_width64=double_width;
 
   // Determine if double precision is supported: All bits in the mask must be set.
-  cl_device_fp_config double_mask = (CL_FP_FMA|CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO|
-                                     CL_FP_ROUND_TO_INF|CL_FP_INF_NAN|CL_FP_DENORM);
+  cl_device_fp_config double_mask = (CL_FP_FMA|CL_FP_ROUND_TO_NEAREST|
+                                     CL_FP_ROUND_TO_ZERO|CL_FP_ROUND_TO_INF|
+                                     CL_FP_INF_NAN|CL_FP_DENORM);
   cl_device_fp_config double_avail;
   CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_DOUBLE_FP_CONFIG,
                                sizeof(double_avail),&double_avail,nullptr));
@@ -684,6 +686,7 @@ void UCL_Device::add_properties(cl_device_id device_list) {
     double arch = static_cast<double>(minor)/10+major;
     if (arch >= 3.0)
       op.has_shuffle_support=true;
+    op.shared_main_memory=_shared_mem_device(device_list);
   }
   delete[] buffer2;
 #endif
diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h
index adf7b6c952..bfc260889a 100644
--- a/lib/gpu/geryon/ocl_memory.h
+++ b/lib/gpu/geryon/ocl_memory.h
@@ -118,15 +118,19 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
 template <class mat_type, class copy_type>
 inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
                       const size_t n) {
-  cl_int error_flag;
-  cl_buffer_region subbuffer;
-  subbuffer.origin = o;
-  subbuffer.size = n;
-  mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
-                                 CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
-                                 &error_flag);
-
-  CL_CHECK_ERR(error_flag);
+  // When viewing outside host allocation with discrete main memory on accelerator,
+  // no cl_buffer object is created to avoid unnecessary creation of device allocs
+  if (cm.shared_mem_device()) {
+    cl_int error_flag;
+    cl_buffer_region subbuffer;
+    subbuffer.origin = o;
+    subbuffer.size = n;
+    mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
+                                   CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
+                                   &error_flag);
+    CL_CHECK_ERR(error_flag);
+  } else
+    mat.cbegin()=(cl_mem)0;
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
 }
@@ -170,10 +174,13 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
 
 template <class mat_type>
 inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
-  cl_int error_flag;
-  mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
-                              n,*mat.host_ptr(),&error_flag);
-  CL_CHECK_ERR(error_flag);
+  if (mat.shared_mem_device()) {
+    cl_int error_flag;
+    mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
+                                n,*mat.host_ptr(),&error_flag);
+    CL_CHECK_ERR(error_flag);
+  } else
+    mat.cbegin()=(cl_mem)0;
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
 }
@@ -181,7 +188,10 @@
 template <class mat_type>
 inline void _host_free(mat_type &mat) {
   if (mat.cols()>0) {
-    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
+    // When viewing outside host allocation with discrete main memory on accelerator,
+    // no cl_buffer object is created to avoid unnecessary creation of device allocs
+    if (mat.cbegin()!=(cl_mem)(0))
+      CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
     CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
   }
 }
diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h
index 51fd33d623..3f478c4b4e 100644
--- a/lib/gpu/geryon/ucl_basemat.h
+++ b/lib/gpu/geryon/ucl_basemat.h
@@ -75,13 +75,21 @@ class UCL_BaseMat {
   inline enum UCL_MEMOPT kind() const { return _kind; }
 
   inline bool shared_mem_device() {
-    #ifdef _OCL_MAT
+    #ifndef _OCL_MAT
+    return false;
+    #else
+
+    #if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON)
+    return true;
+    #elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF)
+    return false;
+    #else
     cl_device_id device;
     CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
                                        sizeof(cl_device_id),&device,NULL));
     return _shared_mem_device(device);
-    #else
-    return false;
+    #endif
+
     #endif
   }
 
diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h
index 41dad2b285..082cfd5980 100644
--- a/lib/gpu/geryon/ucl_h_mat.h
+++ b/lib/gpu/geryon/ucl_h_mat.h
@@ -140,7 +140,10 @@ class UCL_H_Mat : public UCL_BaseMat {
     _end=_array+_cols;
     #ifdef _OCL_MAT
     _carray=input.cbegin();
-    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    // When viewing outside host allocation with discrete main memory on accelerator,
+    // no cl_buffer object is created to avoid unnecessary creation of device allocs
+    if (_carray!=(cl_mem)(0))
+      CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
     CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #endif
   }
diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h
index d9ce0bbba6..2f49f9f633 100644
--- a/lib/gpu/geryon/ucl_h_vec.h
+++ b/lib/gpu/geryon/ucl_h_vec.h
@@ -139,7 +139,10 @@ class UCL_H_Vec : public UCL_BaseMat {
     _end=_array+_cols;
    #ifdef _OCL_MAT
     _carray=input.cbegin();
-    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    // When viewing outside host allocation with discrete main memory on accelerator,
+    // no cl_buffer object is created to avoid unnecessary creation of device allocs
+    if (_carray!=(cl_mem)(0))
+      CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
     CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #endif
   }
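
For background, the pattern this patch adopts can be illustrated outside of Geryon. The sketch below is a minimal, hypothetical version of the same idea, not the library code: wrap an existing host pointer in a cl_mem only when the device reports host-unified memory, leave the handle null otherwise, and guard the release against the null handle. The helper names make_host_view and free_host_view are invented for illustration, and the CL_DEVICE_HOST_UNIFIED_MEMORY query merely stands in for Geryon's _shared_mem_device() test, which the patch additionally allows forcing at compile time via GERYON_FORCE_SHARED_MAIN_MEM_ON/OFF.

    #include <CL/cl.h>
    #include <cstddef>

    // Hypothetical helper mirroring the patch's logic: wrap an existing
    // host pointer in a cl_mem only when the device shares main memory
    // with the host. On discrete-memory devices, a CL_MEM_USE_HOST_PTR
    // buffer may be mirrored by a device-side allocation, so we skip
    // creating the buffer and return a null handle instead.
    static cl_mem make_host_view(cl_context ctx, cl_device_id dev,
                                 void *host_ptr, size_t bytes) {
      cl_bool unified = CL_FALSE;  // stand-in for _shared_mem_device()
      clGetDeviceInfo(dev, CL_DEVICE_HOST_UNIFIED_MEMORY,
                      sizeof(unified), &unified, nullptr);
      if (unified == CL_TRUE) {
        cl_int err;
        cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, bytes,
                                    host_ptr, &err);
        if (err == CL_SUCCESS)
          return buf;
      }
      return (cl_mem)0;  // discrete main memory: view stays host-only
    }

    // Matching cleanup: release only if a buffer was actually created,
    // the same guard the patch adds to _host_free().
    static void free_host_view(cl_mem buf) {
      if (buf != (cl_mem)0)
        clReleaseMemObject(buf);
    }

Under this scheme every consumer of such a view must tolerate a null cl_mem, which is why the patch guards not only the release in _host_free() but also the clRetainMemObject() calls in the UCL_H_Mat and UCL_H_Vec view constructors.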