GPU Package: Switch back to disabling device timers when multiple MPI tasks share a GPU. Logic added to prevent a memory leak.

This commit is contained in:
W. Michael Brown
2022-09-28 21:02:16 -07:00
parent be98b5a168
commit 6e34d21b24
6 changed files with 55 additions and 15 deletions

View File

@ -81,6 +81,10 @@ class UCL_Device {
/// Return the number of devices that support CUDA /// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); } inline int num_devices() { return _properties.size(); }
/// Specify whether profiling (device timers) will be used for the device (yes=true)
/** No-op for CUDA and HIP: the flag is accepted and ignored. The parameter
  * is intentionally left unnamed so that builds using -Wunused-parameter
  * do not emit a warning for this empty body. **/
inline void configure_profiling(const bool /*profiling_on*/) {}
/// Set the CUDA device to the specified device number /// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device /** A context and default command queue will be created for the device
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not

View File

@ -95,6 +95,10 @@ class UCL_Device {
/// Return the number of devices that support CUDA /// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); } inline int num_devices() { return _properties.size(); }
/// Specify whether profiling (device timers) will be used for the device (yes=true)
/** No-op for CUDA and HIP: the flag is accepted and ignored. The parameter
  * is intentionally left unnamed so that builds using -Wunused-parameter
  * do not emit a warning for this empty body. **/
inline void configure_profiling(const bool /*profiling_on*/) {}
/// Set the CUDA device to the specified device number /// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device /** A context and default command queue will be created for the device
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not

View File

@ -125,6 +125,11 @@ class UCL_Device {
/// Return the number of devices that support OpenCL /// Return the number of devices that support OpenCL
inline int num_devices() { return _num_devices; } inline int num_devices() { return _num_devices; }
/// Choose whether command queues are created with profiling support (yes=true)
/** The value is stored in _cq_profiling, which is consulted each time a
  * command queue is created for the device; when true, the queue is
  * requested with CL_QUEUE_PROFILING_ENABLE. **/
inline void configure_profiling(const bool profiling_on) {
  _cq_profiling = profiling_on;
}
/// Set the OpenCL device to the specified device number /// Set the OpenCL device to the specified device number
/** A context and default command queue will be created for the device * /** A context and default command queue will be created for the device *
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
@ -169,10 +174,22 @@ class UCL_Device {
_cq.push_back(cl_command_queue()); _cq.push_back(cl_command_queue());
#ifdef CL_VERSION_2_0 #ifdef CL_VERSION_2_0
cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; if (_cq_profiling) {
_cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props, &errorv); cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
0};
_cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props,
&errorv);
} else {
cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0};
_cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props,
&errorv);
}
#else #else
_cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE, &errorv); if (_cq_profiling)
_cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE,
&errorv);
else
_cq.back()=clCreateCommandQueue(_context, _cl_device, 0, &errorv);
#endif #endif
if (errorv!=CL_SUCCESS) { if (errorv!=CL_SUCCESS) {
std::cerr << "Could not create command queue on device: " << name() std::cerr << "Could not create command queue on device: " << name()
@ -370,6 +387,7 @@ class UCL_Device {
cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
cl_context _context; // Context used for accessing the device cl_context _context; // Context used for accessing the device
std::vector<cl_command_queue> _cq;// The default command queue for this device std::vector<cl_command_queue> _cq;// The default command queue for this device
bool _cq_profiling; // True=create command queues w/ profiling support
int _device; // UCL_Device ID for current device int _device; // UCL_Device ID for current device
cl_device_id _cl_device; // OpenCL ID for current device cl_device_id _cl_device; // OpenCL ID for current device
std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
@ -384,6 +402,7 @@ class UCL_Device {
// Grabs the properties for all devices // Grabs the properties for all devices
UCL_Device::UCL_Device() { UCL_Device::UCL_Device() {
_device=-1; _device=-1;
_cq_profiling=true;
// --- Get Number of Platforms // --- Get Number of Platforms
cl_uint nplatforms; cl_uint nplatforms;

View File

@ -50,11 +50,15 @@ class UCL_Timer {
/** \note init() must be called to reuse timer after a clear() **/ /** \note init() must be called to reuse timer after a clear() **/
inline void clear() { inline void clear() {
if (_initialized) { if (_initialized) {
if (has_measured_time) {
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
has_measured_time = false;
}
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
_initialized=false; _initialized=false;
_total_time=0.0; _total_time=0.0;
} }
has_measured_time = false;
} }
/// Initialize default command queue for timing /// Initialize default command queue for timing
@ -71,9 +75,13 @@ class UCL_Timer {
/// Start timing on default command queue /// Start timing on default command queue
inline void start() { inline void start() {
UCL_OCL_MARKER(_cq,&start_event); if (has_measured_time) {
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
has_measured_time = false; has_measured_time = false;
} }
UCL_OCL_MARKER(_cq,&start_event);
}
/// Stop timing on default command queue /// Stop timing on default command queue
inline void stop() { inline void stop() {
@ -83,9 +91,13 @@ class UCL_Timer {
/// Block until the start event has been reached on device /// Block until the start event has been reached on device
inline void sync_start() { inline void sync_start() {
CL_SAFE_CALL(clWaitForEvents(1,&start_event)); if (has_measured_time) {
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
has_measured_time = false; has_measured_time = false;
} }
CL_SAFE_CALL(clWaitForEvents(1,&start_event));
}
/// Block until the stop event has been reached on device /// Block until the stop event has been reached on device
inline void sync_stop() { inline void sync_stop() {

View File

@ -265,15 +265,13 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
// Time on the device only if 1 proc per gpu // Time on the device only if 1 proc per gpu
_time_device=true; _time_device=true;
#if 0 // Previous source of OCL memory leak when time_device=false
// XXX: the following setting triggers a memory leak with OpenCL and MPI // - Logic added to release OCL events when timers are not invoked
// setting _time_device=true for all processes doesn't seem to be a
// problem with either (no segfault, no (large) memory leak.
// thus keeping this disabled for now. may need to review later.
// 2018-07-23 <akohlmey@gmail.com>
if (_procs_per_gpu>1) if (_procs_per_gpu>1)
_time_device=false; _time_device=false;
#endif
if (!_time_device && _particle_split > 0)
gpu->configure_profiling(false);
// Set up a per device communicator // Set up a per device communicator
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
@ -715,7 +713,9 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls,
dev_data_out[0].flush(); dev_data_out[0].flush();
#endif #endif
driver_time=MPI_Wtime()-driver_time; driver_time=MPI_Wtime()-driver_time;
double time=over_timer.seconds(); double time=0.0;
if (_time_device)
time=over_timer.seconds();
if (time_device()) { if (time_device()) {
for (int i=0; i<_data_in_estimate; i++) for (int i=0; i<_data_in_estimate; i++)

View File

@ -304,6 +304,7 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
const double delxinv, const double delyinv, const double delxinv, const double delyinv,
const double delzinv) { const double delzinv) {
if (!_precompute_done) { if (!_precompute_done) {
if (device->time_device())
atom->acc_timers(); atom->acc_timers();
_precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv, _precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
delyinv,delzinv); delyinv,delzinv);