diff --git a/lib/gpu/geryon/hip_device.h b/lib/gpu/geryon/hip_device.h index f809323ee7..d14631fa0f 100644 --- a/lib/gpu/geryon/hip_device.h +++ b/lib/gpu/geryon/hip_device.h @@ -81,6 +81,10 @@ class UCL_Device { /// Return the number of devices that support CUDA inline int num_devices() { return _properties.size(); } + /// Specify whether profiling (device timers) will be used for the device (yes=true) + /** No-op for CUDA and HIP **/ + inline void configure_profiling(const bool profiling_on) {} + /// Set the CUDA device to the specified device number /** A context and default command queue will be created for the device * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 5e2444b4d1..5875dfaa7c 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -95,6 +95,10 @@ class UCL_Device { /// Return the number of devices that support CUDA inline int num_devices() { return _properties.size(); } + /// Specify whether profiling (device timers) will be used for the device (yes=true) + /** No-op for CUDA and HIP **/ + inline void configure_profiling(const bool profiling_on) {} + /// Set the CUDA device to the specified device number /** A context and default command queue will be created for the device * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 092b4ad11b..6a563b5f47 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -125,6 +125,11 @@ class UCL_Device { /// Return the number of devices that support OpenCL inline int num_devices() { return _num_devices; } + /// Specify whether profiling (device timers) will be used for the device (yes=true) + /** Stored in _cq_profiling; command queues are created with CL_QUEUE_PROFILING_ENABLE only when it is true **/ + inline void configure_profiling(const bool profiling_on) + { _cq_profiling = profiling_on; } + /// Set the OpenCL device to the specified device number 
/** A context and default command queue will be created for the device * * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not @@ -169,10 +174,22 @@ class UCL_Device { _cq.push_back(cl_command_queue()); #ifdef CL_VERSION_2_0 - cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; - _cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props, &errorv); + if (_cq_profiling) { // set via configure_profiling(); defaults to true + cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, + 0}; + _cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props, + &errorv); + } else { + cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0, 0}; // {name, value=0 (no profiling), list terminator} + _cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props, + &errorv); + } #else - _cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE, &errorv); + if (_cq_profiling) + _cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE, + &errorv); + else + _cq.back()=clCreateCommandQueue(_context, _cl_device, 0, &errorv); #endif if (errorv!=CL_SUCCESS) { std::cerr << "Could not create command queue on device: " << name() @@ -370,6 +387,7 @@ class UCL_Device { cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms cl_context _context; // Context used for accessing the device std::vector _cq;// The default command queue for this device + bool _cq_profiling; // True=create command queues w/ profiling support int _device; // UCL_Device ID for current device cl_device_id _cl_device; // OpenCL ID for current device std::vector _cl_devices; // OpenCL IDs for all devices @@ -384,6 +402,7 @@ class UCL_Device { // Grabs the properties for all devices UCL_Device::UCL_Device() { _device=-1; + _cq_profiling=true; // --- Get Number of Platforms cl_uint nplatforms; diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 25b20beea5..bd77170ed9 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ 
-50,11 +50,15 @@ class UCL_Timer { /** \note init() must be called to reuse timer after a clear() **/ inline void clear() { if (_initialized) { + if (has_measured_time) { + clReleaseEvent(start_event); + clReleaseEvent(stop_event); + has_measured_time = false; + } CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); _initialized=false; _total_time=0.0; } - has_measured_time = false; } /// Initialize default command queue for timing @@ -71,8 +75,12 @@ class UCL_Timer { /// Start timing on default command queue inline void start() { + if (has_measured_time) { + clReleaseEvent(start_event); + clReleaseEvent(stop_event); + has_measured_time = false; + } UCL_OCL_MARKER(_cq,&start_event); - has_measured_time = false; } /// Stop timing on default command queue @@ -83,8 +91,12 @@ class UCL_Timer { /// Block until the start event has been reached on device inline void sync_start() { CL_SAFE_CALL(clWaitForEvents(1,&start_event)); - has_measured_time = false; + if (has_measured_time) { // release only after the wait above, which still needs start_event + clReleaseEvent(start_event); + clReleaseEvent(stop_event); + has_measured_time = false; + } } /// Block until the stop event has been reached on device diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 580a3e81b6..5438df2a78 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -265,15 +265,13 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, // Time on the device only if 1 proc per gpu _time_device=true; -#if 0 - // XXX: the following setting triggers a memory leak with OpenCL and MPI - // setting _time_device=true for all processes doesn't seem to be a - // problem with either (no segfault, no (large) memory leak. - // thus keeping this disabled for now. may need to review later. 
- // 2018-07-23 + // _time_device=false was previously a source of an OpenCL memory leak; + // the OpenCL timer now releases its start/stop events before they are reused if (_procs_per_gpu>1) _time_device=false; -#endif + + if (!_time_device && _particle_split > 0) // profiling switch only affects OpenCL; no-op for CUDA and HIP + gpu->configure_profiling(false); // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); @@ -715,7 +713,9 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, dev_data_out[0].flush(); #endif driver_time=MPI_Wtime()-driver_time; - double time=over_timer.seconds(); + double time=0.0; // stays 0 when device timing is disabled + if (_time_device) + time=over_timer.seconds(); if (time_device()) { for (int i=0; i<_data_in_estimate; i++) diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp index 8b3a6b1dde..39249ea350 100644 --- a/lib/gpu/lal_pppm.cpp +++ b/lib/gpu/lal_pppm.cpp @@ -304,7 +304,8 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall, const double delxinv, const double delyinv, const double delzinv) { if (!_precompute_done) { - atom->acc_timers(); + if (device->time_device()) // accumulate device timers only when device timing is enabled + atom->acc_timers(); _precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv, delyinv,delzinv); }