Merge pull request #1008 from akohlmey/support-platform-select

OpenCL related fixes and improvements to the GPU package
2018-07-20 14:32:49 -06:00
parent 2732b8b647 de8176b4fc
commit 884e72a4ba
5 changed files with 74 additions and 22 deletions
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@ -165,8 +165,8 @@ class UCL_Device {
  /// Get the current OpenCL device name
  inline std::string name() { return name(_device); }
  /// Get the OpenCL device name
-  inline std::string name(const int i)
-    { return std::string(_properties[i].name); }
+  inline std::string name(const int i) {
+    return std::string(_properties[i].name); }

  /// Get a string telling the type of the current device
  inline std::string device_type_name() { return device_type_name(_device); }
@ -281,7 +281,7 @@ class UCL_Device {
  inline cl_device_id & cl_device() { return _cl_device; }

  /// Select the platform that has accelerators
-  inline void set_platform_accelerator(int pid=-1);
+  inline int set_platform_accelerator(int pid=-1);

 private:
  int _num_platforms;          // Number of platforms
@ -324,6 +324,7 @@ UCL_Device::~UCL_Device() {

 void UCL_Device::clear() {
  _properties.clear();
+  _cl_devices.clear();
  if (_device>-1) {
    for (size_t i=0; i<_cq.size(); i++) {
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
@ -520,8 +521,6 @@ int UCL_Device::device_type(const int i) {

 // Set the CUDA device to the specified device number
 int UCL_Device::set(int num) {
-  clear();
-
  cl_device_id *device_list = new cl_device_id[_num_devices];
  cl_uint n;
  CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
@ -612,7 +611,7 @@ void UCL_Device::print_all(std::ostream &out) {

 // Select the platform that is associated with accelerators
 // if pid < 0, select the first platform
-void UCL_Device::set_platform_accelerator(int pid) {
+int UCL_Device::set_platform_accelerator(int pid) {
  if (pid < 0) {
    int found = 0;
    for (int n=0; n<_num_platforms; n++) {
@ -625,10 +624,11 @@ void UCL_Device::set_platform_accelerator(int pid) {
          break;
        }
      }
-      if (found) break;
+      if (found) return UCL_SUCCESS;
    }
+    return UCL_ERROR;
  } else {
-    set_platform(pid);
+    return set_platform(pid);
  }
 }

--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -34,8 +34,8 @@ using namespace LAMMPS_AL;

 template <class numtyp, class acctyp>
 DeviceT::Device() : _init_count(0), _device_init(false),
-                                  _gpu_mode(GPU_FORCE), _first_device(0),
-                                  _last_device(0), _compiled(false) {
+                    _gpu_mode(GPU_FORCE), _first_device(0),
+                    _last_device(0), _platform_id(-1), _compiled(false) {
 }

 template <class numtyp, class acctyp>
@ -67,6 +67,17 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
  _particle_split=p_split;
  _cell_size=cell_size;
  _block_pair=block_pair;
+  // support selecting the platform through the "package device" keyword.
+  // "0:generic" will select platform 0 and tune for a generic device
+  // "1:fermi" will select platform 1 and tune for an NVIDIA Fermi GPU
+  if (ocl_vendor) {
+    char *sep = NULL;
+    if ((sep = strstr(ocl_vendor,":"))) {
+      *sep = '\0';
+      _platform_id = atoi(ocl_vendor);
+      ocl_vendor = sep+1;
+    }
+  }

  // Get the rank/size within the world
  MPI_Comm_rank(_comm_world,&_world_me);
@ -135,6 +146,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
    return -7;
  #endif

+  if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS)
+    return -12;
+
  if (gpu->set(my_gpu)!=UCL_SUCCESS)
    return -6;

@ -191,13 +205,15 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
    _ocl_vendor_string="-DUSE_OPENCL";
    int token_count=0;
    std::string params[13];
-    char *pch = strtok(ocl_vendor,"\" ");
+    char *pch = strtok(ocl_vendor,",");
+    pch = strtok(NULL,",");
+    if (pch == NULL) return -11;
    while (pch != NULL) {
      if (token_count==13)
        return -11;
      params[token_count]=pch;
      token_count++;
-      pch = strtok(NULL,"\" ");
+      pch = strtok(NULL,",");
    }
    _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
                        " -DTHREADS_PER_ATOM="+params[1]+
@ -656,7 +672,7 @@ int DeviceT::compile_kernels() {
  dev_program=new UCL_Program(*gpu);
  int success=dev_program->load_string(device,compile_string().c_str());
  if (success!=UCL_SUCCESS)
-    return -4;
+    return -6;
  k_zero.set_function(*dev_program,"kernel_zero");
  k_info.set_function(*dev_program,"kernel_info");
  _compiled=true;
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@ -292,7 +292,7 @@ class Device {
  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
      _replica_size;
-  int _gpu_mode, _first_device, _last_device, _nthreads;
+  int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
  double _particle_split;
  double _cpu_full;
  double _ptx_arch;