From 71464d831428bf048bdf225008558be01eb249b0 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown"
Date: Wed, 28 Sep 2022 22:30:09 -0700
Subject: [PATCH] GPU Package: Fixing logic in OpenCL backend that could
 result in unnecessary device allocations.

---
 lib/gpu/geryon/ocl_device.h  |  9 ++++++---
 lib/gpu/geryon/ocl_memory.h  | 38 +++++++++++++++++++++++-------------
 lib/gpu/geryon/ucl_basemat.h | 14 ++++++++++---
 lib/gpu/geryon/ucl_h_mat.h   |  5 ++++-
 lib/gpu/geryon/ucl_h_vec.h   |  5 ++++-
 5 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 6a563b5f47..ceb0ded745 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -99,6 +99,7 @@ struct OCLProperties {
   int cl_device_version;
   bool has_subgroup_support;
   bool has_shuffle_support;
+  bool shared_main_memory;
 };
 
 /// Class for looking at data parallel device properties
@@ -226,7 +227,7 @@ class UCL_Device {
   inline bool shared_memory() { return shared_memory(_device); }
   /// Returns true if host memory is efficiently addressable from device
   inline bool shared_memory(const int i)
-    { return _shared_mem_device(_cl_devices[i]); }
+    { return _properties[i].shared_main_memory; }
 
   /// Returns preferred vector width
   inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
@@ -582,8 +583,9 @@ void UCL_Device::add_properties(cl_device_id device_list) {
   op.preferred_vector_width64=double_width;
 
   // Determine if double precision is supported: All bits in the mask must be set.
-  cl_device_fp_config double_mask = (CL_FP_FMA|CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO|
-                                     CL_FP_ROUND_TO_INF|CL_FP_INF_NAN|CL_FP_DENORM);
+  cl_device_fp_config double_mask = (CL_FP_FMA|CL_FP_ROUND_TO_NEAREST|
+                                     CL_FP_ROUND_TO_ZERO|CL_FP_ROUND_TO_INF|
+                                     CL_FP_INF_NAN|CL_FP_DENORM);
   cl_device_fp_config double_avail;
   CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_DOUBLE_FP_CONFIG,
                                sizeof(double_avail),&double_avail,nullptr));
@@ -684,6 +686,7 @@ void UCL_Device::add_properties(cl_device_id device_list) {
     double arch = static_cast<double>(minor)/10+major;
     if (arch >= 3.0)
       op.has_shuffle_support=true;
+    op.shared_main_memory=_shared_mem_device(device_list);
   }
   delete[] buffer2;
 #endif
diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h
index adf7b6c952..bfc260889a 100644
--- a/lib/gpu/geryon/ocl_memory.h
+++ b/lib/gpu/geryon/ocl_memory.h
@@ -118,15 +118,19 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
 template <class mat_type, class copy_type>
 inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
                       const size_t n) {
-  cl_int error_flag;
-  cl_buffer_region subbuffer;
-  subbuffer.origin = o;
-  subbuffer.size = n;
-  mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
-                                 CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
-                                 &error_flag);
-
-  CL_CHECK_ERR(error_flag);
+  // When viewing outside host allocation with discrete main memory on accelerator,
+  // no cl_buffer object is created to avoid unnecessary creation of device allocs
+  if (cm.shared_mem_device()) {
+    cl_int error_flag;
+    cl_buffer_region subbuffer;
+    subbuffer.origin = o;
+    subbuffer.size = n;
+    mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
+                                   CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
+                                   &error_flag);
+    CL_CHECK_ERR(error_flag);
+  } else
+    mat.cbegin()=(cl_mem)0;
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
 }
@@ -170,10 +174,13 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
 
 template <class mat_type>
 inline int _host_view(mat_type &mat, UCL_Device &dev, const size_t n) {
-  cl_int error_flag;
-  mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
-                              n,*mat.host_ptr(),&error_flag);
-  CL_CHECK_ERR(error_flag);
+  if (mat.shared_mem_device()) {
+    cl_int error_flag;
+    mat.cbegin()=clCreateBuffer(dev.context(), CL_MEM_USE_HOST_PTR,
+                                n,*mat.host_ptr(),&error_flag);
+    CL_CHECK_ERR(error_flag);
+  } else
+    mat.cbegin()=(cl_mem)0;
   CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
   return UCL_SUCCESS;
 }
@@ -181,7 +188,10 @@
 template <class mat_type>
 inline void _host_free(mat_type &mat) {
   if (mat.cols()>0) {
-    CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
+    // When viewing outside host allocation with discrete main memory on accelerator,
+    // no cl_buffer object is created to avoid unnecessary creation of device allocs
+    if (mat.cbegin()!=(cl_mem)(0))
+      CL_DESTRUCT_CALL(clReleaseMemObject(mat.cbegin()));
     CL_DESTRUCT_CALL(clReleaseCommandQueue(mat.cq()));
   }
 }
diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h
index 51fd33d623..3f478c4b4e 100644
--- a/lib/gpu/geryon/ucl_basemat.h
+++ b/lib/gpu/geryon/ucl_basemat.h
@@ -75,13 +75,21 @@ class UCL_BaseMat {
   inline enum UCL_MEMOPT kind() const { return _kind; }
 
   inline bool shared_mem_device() {
-    #ifdef _OCL_MAT
+    #ifndef _OCL_MAT
+    return false;
+    #else
+
+    #if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON)
+    return true;
+    #elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF)
+    return false;
+    #else
     cl_device_id device;
     CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
                                        sizeof(cl_device_id),&device,NULL));
     return _shared_mem_device(device);
-    #else
-    return false;
+    #endif
+
     #endif
   }
 
diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h
index 41dad2b285..082cfd5980 100644
--- a/lib/gpu/geryon/ucl_h_mat.h
+++ b/lib/gpu/geryon/ucl_h_mat.h
@@ -140,7 +140,10 @@ class UCL_H_Mat : public UCL_BaseMat {
     _end=_array+_cols;
     #ifdef _OCL_MAT
     _carray=input.cbegin();
-    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    // When viewing outside host allocation with discrete main memory on accelerator,
+    // no cl_buffer object is created to avoid unnecessary creation of device allocs
+    if (_carray!=(cl_mem)(0))
+      CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
     CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
     #endif
   }
diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h
index d9ce0bbba6..2f49f9f633 100644
--- a/lib/gpu/geryon/ucl_h_vec.h
+++ b/lib/gpu/geryon/ucl_h_vec.h
@@ -139,7 +139,10 @@ class UCL_H_Vec : public UCL_BaseMat {
     _end=_array+_cols;
    #ifdef _OCL_MAT
     _carray=input.cbegin();
-    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
+    // When viewing outside host allocation with discrete main memory on accelerator,
+    // no cl_buffer object is created to avoid unnecessary creation of device allocs
+    if (_carray!=(cl_mem)(0))
+      CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
     CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #endif
   }
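
For background, the pattern this patch adopts can be illustrated outside of Geryon. The sketch below is a minimal, hypothetical version of the same idea, not the library code: wrap an existing host pointer in a cl_mem only when the device reports host-unified memory, leave the handle null otherwise, and guard the release against the null handle. The helper names make_host_view and free_host_view are invented for illustration, and the CL_DEVICE_HOST_UNIFIED_MEMORY query merely stands in for Geryon's _shared_mem_device() test, which the patch additionally allows forcing at compile time via GERYON_FORCE_SHARED_MAIN_MEM_ON/OFF.

    #include <CL/cl.h>
    #include <cstddef>

    // Hypothetical helper mirroring the patch's logic: wrap an existing
    // host pointer in a cl_mem only when the device shares main memory
    // with the host. On discrete-memory devices, a CL_MEM_USE_HOST_PTR
    // buffer may be mirrored by a device-side allocation, so we skip
    // creating the buffer and return a null handle instead.
    static cl_mem make_host_view(cl_context ctx, cl_device_id dev,
                                 void *host_ptr, size_t bytes) {
      cl_bool unified = CL_FALSE;  // stand-in for _shared_mem_device()
      clGetDeviceInfo(dev, CL_DEVICE_HOST_UNIFIED_MEMORY,
                      sizeof(unified), &unified, nullptr);
      if (unified == CL_TRUE) {
        cl_int err;
        cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, bytes,
                                    host_ptr, &err);
        if (err == CL_SUCCESS)
          return buf;
      }
      return (cl_mem)0;  // discrete main memory: view stays host-only
    }

    // Matching cleanup: release only if a buffer was actually created,
    // the same guard the patch adds to _host_free().
    static void free_host_view(cl_mem buf) {
      if (buf != (cl_mem)0)
        clReleaseMemObject(buf);
    }

Under this scheme every consumer of such a view must tolerate a null cl_mem, which is why the patch guards not only the release in _host_free() but also the clRetainMemObject() calls in the UCL_H_Mat and UCL_H_Vec view constructors.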