GPU Package: Switching back to disabling device timers when multiple MPI tasks share a GPU. Logic added to prevent a memory leak.
This commit is contained in:
@ -81,6 +81,10 @@ class UCL_Device {
|
|||||||
/// Return the number of devices that support CUDA
|
/// Return the number of devices that support CUDA
|
||||||
inline int num_devices() { return _properties.size(); }
|
inline int num_devices() { return _properties.size(); }
|
||||||
|
|
||||||
|
/// Specify whether profiling (device timers) will be used for the device (yes=true)
|
||||||
|
/** No-op for CUDA and HIP **/
|
||||||
|
inline void configure_profiling(const bool profiling_on) {}
|
||||||
|
|
||||||
/// Set the CUDA device to the specified device number
|
/// Set the CUDA device to the specified device number
|
||||||
/** A context and default command queue will be created for the device
|
/** A context and default command queue will be created for the device
|
||||||
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
|
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
|
||||||
|
|||||||
@ -95,6 +95,10 @@ class UCL_Device {
|
|||||||
/// Return the number of devices that support CUDA
|
/// Return the number of devices that support CUDA
|
||||||
inline int num_devices() { return _properties.size(); }
|
inline int num_devices() { return _properties.size(); }
|
||||||
|
|
||||||
|
/// Specify whether profiling (device timers) will be used for the device (yes=true)
|
||||||
|
/** No-op for CUDA and HIP **/
|
||||||
|
inline void configure_profiling(const bool profiling_on) {}
|
||||||
|
|
||||||
/// Set the CUDA device to the specified device number
|
/// Set the CUDA device to the specified device number
|
||||||
/** A context and default command queue will be created for the device
|
/** A context and default command queue will be created for the device
|
||||||
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
|
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
|
||||||
|
|||||||
@ -125,6 +125,11 @@ class UCL_Device {
|
|||||||
/// Return the number of devices that support OpenCL
|
/// Return the number of devices that support OpenCL
|
||||||
inline int num_devices() { return _num_devices; }
|
inline int num_devices() { return _num_devices; }
|
||||||
|
|
||||||
|
/// Specify whether profiling (device timers) will be used for the device (yes=true)
|
||||||
|
/** No-op for CUDA and HIP; for OpenCL, selects whether command queues are created with profiling enabled **/
|
||||||
|
inline void configure_profiling(const bool profiling_on)
|
||||||
|
{ _cq_profiling = profiling_on; }
|
||||||
|
|
||||||
/// Set the OpenCL device to the specified device number
|
/// Set the OpenCL device to the specified device number
|
||||||
/** A context and default command queue will be created for the device *
|
/** A context and default command queue will be created for the device *
|
||||||
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
|
* Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
|
||||||
@ -169,10 +174,22 @@ class UCL_Device {
|
|||||||
_cq.push_back(cl_command_queue());
|
_cq.push_back(cl_command_queue());
|
||||||
|
|
||||||
#ifdef CL_VERSION_2_0
|
#ifdef CL_VERSION_2_0
|
||||||
cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
|
if (_cq_profiling) {
|
||||||
_cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props, &errorv);
|
cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
|
||||||
|
0};
|
||||||
|
_cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props,
|
||||||
|
&errorv);
|
||||||
|
} else {
|
||||||
|
cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, 0};
|
||||||
|
_cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props,
|
||||||
|
&errorv);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
_cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE, &errorv);
|
if (_cq_profiling)
|
||||||
|
_cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE,
|
||||||
|
&errorv);
|
||||||
|
else
|
||||||
|
_cq.back()=clCreateCommandQueue(_context, _cl_device, 0, &errorv);
|
||||||
#endif
|
#endif
|
||||||
if (errorv!=CL_SUCCESS) {
|
if (errorv!=CL_SUCCESS) {
|
||||||
std::cerr << "Could not create command queue on device: " << name()
|
std::cerr << "Could not create command queue on device: " << name()
|
||||||
@ -370,6 +387,7 @@ class UCL_Device {
|
|||||||
cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
|
cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
|
||||||
cl_context _context; // Context used for accessing the device
|
cl_context _context; // Context used for accessing the device
|
||||||
std::vector<cl_command_queue> _cq;// The default command queue for this device
|
std::vector<cl_command_queue> _cq;// The default command queue for this device
|
||||||
|
bool _cq_profiling; // True=create command queues w/ profiling support
|
||||||
int _device; // UCL_Device ID for current device
|
int _device; // UCL_Device ID for current device
|
||||||
cl_device_id _cl_device; // OpenCL ID for current device
|
cl_device_id _cl_device; // OpenCL ID for current device
|
||||||
std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
|
std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
|
||||||
@ -384,6 +402,7 @@ class UCL_Device {
|
|||||||
// Grabs the properties for all devices
|
// Grabs the properties for all devices
|
||||||
UCL_Device::UCL_Device() {
|
UCL_Device::UCL_Device() {
|
||||||
_device=-1;
|
_device=-1;
|
||||||
|
_cq_profiling=true;
|
||||||
|
|
||||||
// --- Get Number of Platforms
|
// --- Get Number of Platforms
|
||||||
cl_uint nplatforms;
|
cl_uint nplatforms;
|
||||||
|
|||||||
@ -50,11 +50,15 @@ class UCL_Timer {
|
|||||||
/** \note init() must be called to reuse timer after a clear() **/
|
/** \note init() must be called to reuse timer after a clear() **/
|
||||||
inline void clear() {
|
inline void clear() {
|
||||||
if (_initialized) {
|
if (_initialized) {
|
||||||
|
if (has_measured_time) {
|
||||||
|
clReleaseEvent(start_event);
|
||||||
|
clReleaseEvent(stop_event);
|
||||||
|
has_measured_time = false;
|
||||||
|
}
|
||||||
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
|
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
|
||||||
_initialized=false;
|
_initialized=false;
|
||||||
_total_time=0.0;
|
_total_time=0.0;
|
||||||
}
|
}
|
||||||
has_measured_time = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Initialize default command queue for timing
|
/// Initialize default command queue for timing
|
||||||
@ -71,8 +75,12 @@ class UCL_Timer {
|
|||||||
|
|
||||||
/// Start timing on default command queue
|
/// Start timing on default command queue
|
||||||
inline void start() {
|
inline void start() {
|
||||||
|
if (has_measured_time) {
|
||||||
|
clReleaseEvent(start_event);
|
||||||
|
clReleaseEvent(stop_event);
|
||||||
|
has_measured_time = false;
|
||||||
|
}
|
||||||
UCL_OCL_MARKER(_cq,&start_event);
|
UCL_OCL_MARKER(_cq,&start_event);
|
||||||
has_measured_time = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Stop timing on default command queue
|
/// Stop timing on default command queue
|
||||||
@ -83,8 +91,12 @@ class UCL_Timer {
|
|||||||
|
|
||||||
/// Block until the start event has been reached on device
|
/// Block until the start event has been reached on device
|
||||||
inline void sync_start() {
|
inline void sync_start() {
|
||||||
|
if (has_measured_time) {
|
||||||
|
clReleaseEvent(start_event);
|
||||||
|
clReleaseEvent(stop_event);
|
||||||
|
has_measured_time = false;
|
||||||
|
}
|
||||||
CL_SAFE_CALL(clWaitForEvents(1,&start_event));
|
CL_SAFE_CALL(clWaitForEvents(1,&start_event));
|
||||||
has_measured_time = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Block until the stop event has been reached on device
|
/// Block until the stop event has been reached on device
|
||||||
|
|||||||
@ -265,15 +265,13 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
|
|||||||
// Time on the device only if 1 proc per gpu
|
// Time on the device only if 1 proc per gpu
|
||||||
_time_device=true;
|
_time_device=true;
|
||||||
|
|
||||||
#if 0
|
// Previous source of OCL memory leak when time_device=false
|
||||||
// XXX: the following setting triggers a memory leak with OpenCL and MPI
|
// - Logic added to release OCL events when timers are not invoked
|
||||||
// setting _time_device=true for all processes doesn't seem to be a
|
|
||||||
// problem with either (no segfault, no (large) memory leak.
|
|
||||||
// thus keeping this disabled for now. may need to review later.
|
|
||||||
// 2018-07-23 <akohlmey@gmail.com>
|
|
||||||
if (_procs_per_gpu>1)
|
if (_procs_per_gpu>1)
|
||||||
_time_device=false;
|
_time_device=false;
|
||||||
#endif
|
|
||||||
|
if (!_time_device && _particle_split > 0)
|
||||||
|
gpu->configure_profiling(false);
|
||||||
|
|
||||||
// Set up a per device communicator
|
// Set up a per device communicator
|
||||||
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
|
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
|
||||||
@ -715,7 +713,9 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls,
|
|||||||
dev_data_out[0].flush();
|
dev_data_out[0].flush();
|
||||||
#endif
|
#endif
|
||||||
driver_time=MPI_Wtime()-driver_time;
|
driver_time=MPI_Wtime()-driver_time;
|
||||||
double time=over_timer.seconds();
|
double time=0.0;
|
||||||
|
if (_time_device)
|
||||||
|
time=over_timer.seconds();
|
||||||
|
|
||||||
if (time_device()) {
|
if (time_device()) {
|
||||||
for (int i=0; i<_data_in_estimate; i++)
|
for (int i=0; i<_data_in_estimate; i++)
|
||||||
|
|||||||
@ -304,7 +304,8 @@ int PPPMT::spread(const int ago, const int nlocal, const int nall,
|
|||||||
const double delxinv, const double delyinv,
|
const double delxinv, const double delyinv,
|
||||||
const double delzinv) {
|
const double delzinv) {
|
||||||
if (!_precompute_done) {
|
if (!_precompute_done) {
|
||||||
atom->acc_timers();
|
if (device->time_device())
|
||||||
|
atom->acc_timers();
|
||||||
_precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
|
_precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
|
||||||
delyinv,delzinv);
|
delyinv,delzinv);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user