diff --git a/lib/gpu/geryon/hip_device.h b/lib/gpu/geryon/hip_device.h
index f809323ee7..d14631fa0f 100644
--- a/lib/gpu/geryon/hip_device.h
+++ b/lib/gpu/geryon/hip_device.h
@@ -81,6 +81,10 @@ class UCL_Device {
   /// Return the number of devices that support CUDA
   inline int num_devices() { return _properties.size(); }
 
+  /// Specify whether profiling (device timers) will be used for the device (yes=true)
+  /** No-op for CUDA and HIP: profiling_on is accepted and ignored **/
+  inline void configure_profiling(const bool profiling_on) {}
+  
   /// Set the CUDA device to the specified device number
   /** A context and default command queue will be created for the device
     * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index 5e2444b4d1..5875dfaa7c 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -95,6 +95,10 @@ class UCL_Device {
   /// Return the number of devices that support CUDA
   inline int num_devices() { return _properties.size(); }
 
+  /// Specify whether profiling (device timers) will be used for the device (yes=true)
+  /** No-op for CUDA and HIP: profiling_on is accepted and ignored **/
+  inline void configure_profiling(const bool profiling_on) {}
+  
   /// Set the CUDA device to the specified device number
   /** A context and default command queue will be created for the device
     * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 092b4ad11b..6a563b5f47 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -125,6 +125,11 @@ class UCL_Device {
   /// Return the number of devices that support OpenCL
   inline int num_devices() { return _num_devices; }
 
+  /// Specify whether profiling (device timers) will be used for the device (yes=true)
+  /** Only stores the flag; it is read when a command queue is created **/
+  inline void configure_profiling(const bool profiling_on)
+    { _cq_profiling = profiling_on; }
+
   /// Set the OpenCL device to the specified device number
   /** A context and default command queue will be created for the device *
     * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
@@ -169,10 +174,22 @@ class UCL_Device {
     _cq.push_back(cl_command_queue());
 
 #ifdef CL_VERSION_2_0
-    cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
-    _cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props, &errorv);
+    if (_cq_profiling) {
+      cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
+                                     0};
+      _cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props,
+                                                    &errorv);
+    } else {
+      cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0};
+      _cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props,
+                                                    &errorv);
+    }
 #else
-    _cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE, &errorv);
+    if (_cq_profiling)
+      _cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE,
+                                      &errorv);
+    else
+      _cq.back()=clCreateCommandQueue(_context, _cl_device, 0, &errorv);
 #endif
     if (errorv!=CL_SUCCESS) {
       std::cerr << "Could not create command queue on device: " << name()
@@ -370,6 +387,7 @@ class UCL_Device {
   cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
   cl_context _context;              // Context used for accessing the device
   std::vector<cl_command_queue> _cq;// The default command queue for this device
+  bool _cq_profiling;               // True=create command queues w/ CL_QUEUE_PROFILING_ENABLE
   int _device;                            // UCL_Device ID for current device
   cl_device_id _cl_device;                // OpenCL ID for current device
   std::vector<cl_device_id> _cl_devices;  // OpenCL IDs for all devices
@@ -384,6 +402,7 @@ class UCL_Device {
 // Grabs the properties for all devices
 UCL_Device::UCL_Device() {
   _device=-1;
+  _cq_profiling=true;
 
   // --- Get Number of Platforms
   cl_uint nplatforms;
diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h
index 25b20beea5..bd77170ed9 100644
--- a/lib/gpu/geryon/ocl_timer.h
+++ b/lib/gpu/geryon/ocl_timer.h
@@ -50,11 +50,15 @@ class UCL_Timer {
   /** \note init() must be called to reuse timer after a clear() **/
   inline void clear() {
     if (_initialized) {
+      if (has_measured_time) {  // presumably: events exist but were never read back
+        clReleaseEvent(start_event);
+        clReleaseEvent(stop_event);
+        has_measured_time = false;  // guards against releasing the events twice
+      }
       CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
       _initialized=false;
       _total_time=0.0;
     }
-    has_measured_time = false;
   }
 
   /// Initialize default command queue for timing
@@ -71,8 +75,12 @@ class UCL_Timer {
 
   /// Start timing on default command queue
   inline void start() {
+    if (has_measured_time) {  // presumably: previous events were never read back
+      clReleaseEvent(start_event);  // NOTE(review): presumably overwritten by the marker below
+      clReleaseEvent(stop_event);
+      has_measured_time = false;  // guards against releasing the events twice
+    }
     UCL_OCL_MARKER(_cq,&start_event);
-    has_measured_time = false;
   }
 
   /// Stop timing on default command queue
@@ -83,8 +91,12 @@ class UCL_Timer {
 
   /// Block until the start event has been reached on device
   inline void sync_start() {
+    if (has_measured_time) {
+      clReleaseEvent(start_event);  // NOTE(review): start_event is still waited on below; confirm
+      clReleaseEvent(stop_event);
+      has_measured_time = false;  // guards against releasing the events twice
+    }
     CL_SAFE_CALL(clWaitForEvents(1,&start_event));
-    has_measured_time = false;
   }
 
   /// Block until the stop event has been reached on device
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 580a3e81b6..5438df2a78 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -265,15 +265,13 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
   // Time on the device only if 1 proc per gpu
   _time_device=true;
 
-#if 0
-  // XXX: the following setting triggers a memory leak with OpenCL and MPI
-  //      setting _time_device=true for all processes doesn't seem to be a
-  //      problem with either (no segfault, no (large) memory leak.
-  //      thus keeping this disabled for now. may need to review later.
-  //      2018-07-23 <akohlmey@gmail.com>
+  // Disabling device timing here was previously a source of an OpenCL memory
+  // leak; the timers now release their OpenCL events when they are not invoked
   if (_procs_per_gpu>1)
     _time_device=false;
-#endif
+
+  if (!_time_device && _particle_split > 0)
+    gpu->configure_profiling(false);
 
   // Set up a per device communicator
   MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
@@ -715,7 +713,9 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls,
       dev_data_out[0].flush();
     #endif
     driver_time=MPI_Wtime()-driver_time;
-    double time=over_timer.seconds();
+    double time=0.0;
+    if (_time_device)
+      time=over_timer.seconds();
 
     if (time_device()) {
       for (int i=0; i<_data_in_estimate; i++)
diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp
index 8b3a6b1dde..39249ea350 100644
--- a/lib/gpu/lal_pppm.cpp
+++ b/lib/gpu/lal_pppm.cpp
@@ -304,7 +304,8 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
                            const double delxinv, const double delyinv,
                            const double delzinv) {
   if (!_precompute_done) {
-    atom->acc_timers();
+    if (device->time_device())
+      atom->acc_timers();
     _precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
                 delyinv,delzinv);
   }