various minor OpenCL related fixes and improvements to the GPU package

- document previously undocumented OpenCL tune settings
- implement OpenCL platform selection through prefixing the device type with the platform id separated by a colon
- allow passing custom tune parameters through postfixing the device type with the 13 tunable parameters separated by commas
- remove an extra clear() that would delete device properties structs and cause LAMMPS to output garbage strings
This commit is contained in:
Axel Kohlmeyer
2018-07-20 14:41:54 -04:00
parent 36081f9ffd
commit de8176b4fc
5 changed files with 74 additions and 22 deletions

View File

@ -33,8 +33,10 @@ args = arguments specific to the style :l
last = ID of last GPU to be used on each node last = ID of last GPU to be used on each node
{tpa} value = Nthreads {tpa} value = Nthreads
Nthreads = # of GPU threads used per atom Nthreads = # of GPU threads used per atom
{device} value = device_type {device} value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13
device_type = {kepler} or {fermi} or {cypress} or {generic} platform_id = numerical OpenCL platform id (default: -1)
device_type = {kepler} or {fermi} or {cypress} or {intel} or {phi} or {generic} or {custom}
val1,val2,... = custom OpenCL tune parameters (see below for details)
{blocksize} value = size {blocksize} value = size
size = thread block size for pair force computation size = thread block size for pair force computation
{intel} args = NPhi keyword value ... {intel} args = NPhi keyword value ...
@ -96,6 +98,9 @@ args = arguments specific to the style :l
package gpu 1 package gpu 1
package gpu 1 split 0.75 package gpu 1 split 0.75
package gpu 2 split -1.0 package gpu 2 split -1.0
package gpu 1 device kepler
package gpu 1 device 2:generic
package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
package kokkos neigh half comm device package kokkos neigh half comm device
package omp 0 neigh no package omp 0 neigh no
package omp 4 package omp 4
@ -244,12 +249,40 @@ the value can improve performance. The number of threads per atom must
be a power of 2 and currently cannot be greater than 32. be a power of 2 and currently cannot be greater than 32.
The {device} keyword can be used to tune parameters optimized for a The {device} keyword can be used to tune parameters optimized for a
specific accelerator, when using OpenCL. For CUDA, the {device} specific accelerator and platform when using OpenCL. OpenCL supports
keyword is ignored. Currently, the device type is limited to NVIDIA the concept of a [platform], which represents one or more devices that
Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices share the same driver (e.g. there would be a different platform for
may be added later. The default device type can be specified when GPUs from different vendors or for CPU based accelerator support).
building LAMMPS with the GPU library, via settings in the In LAMMPS only one platform can be active at a time and by default
lib/gpu/Makefile that is used. the first platform with an accelerator is selected. This is equivalent
to using a platform ID of -1. The platform ID is a number corresponding
to the output of the ocl_get_devices tool. The platform ID is passed
to the GPU library, by prefixing the {device} keyword with that number
separated by a colon. For CUDA, the {device} keyword is ignored.
Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
More devices may be added later. The default device type can be
specified when building LAMMPS with the GPU library, via setting a
variable in the lib/gpu/Makefile that is used.
In addition, a device type {custom} is available, which is followed by
13 comma-separated numbers, which allows setting those tunable parameters
from the package command. It can be combined with the (colon separated)
platform id. The individual settings are:
MEM_THREADS
THREADS_PER_ATOM
THREADS_PER_CHARGE
BLOCK_PAIR
MAX_SHARED_TYPES
BLOCK_NBOR_BUILD
BLOCK_BIO_PAIR
BLOCK_ELLIPSE
WARP_SIZE
PPPM_BLOCK_1D
BLOCK_CELL_2D
BLOCK_CELL_ID
MAX_BIO_SHARED_TYPES :ul
The {blocksize} keyword allows you to tweak the number of threads used The {blocksize} keyword allows you to tweak the number of threads used
per thread block. This number should be a multiple of 32 (for GPUs) per thread block. This number should be a multiple of 32 (for GPUs)

View File

@ -165,8 +165,8 @@ class UCL_Device {
/// Get the current OpenCL device name /// Get the current OpenCL device name
inline std::string name() { return name(_device); } inline std::string name() { return name(_device); }
/// Get the OpenCL device name /// Get the OpenCL device name
inline std::string name(const int i) inline std::string name(const int i) {
{ return std::string(_properties[i].name); } return std::string(_properties[i].name); }
/// Get a string telling the type of the current device /// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); } inline std::string device_type_name() { return device_type_name(_device); }
@ -281,7 +281,7 @@ class UCL_Device {
inline cl_device_id & cl_device() { return _cl_device; } inline cl_device_id & cl_device() { return _cl_device; }
/// Select the platform that has accelerators /// Select the platform that has accelerators
inline void set_platform_accelerator(int pid=-1); inline int set_platform_accelerator(int pid=-1);
private: private:
int _num_platforms; // Number of platforms int _num_platforms; // Number of platforms
@ -324,6 +324,7 @@ UCL_Device::~UCL_Device() {
void UCL_Device::clear() { void UCL_Device::clear() {
_properties.clear(); _properties.clear();
_cl_devices.clear();
if (_device>-1) { if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) { for (size_t i=0; i<_cq.size(); i++) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
@ -520,8 +521,6 @@ int UCL_Device::device_type(const int i) {
// Set the CUDA device to the specified device number // Set the CUDA device to the specified device number
int UCL_Device::set(int num) { int UCL_Device::set(int num) {
clear();
cl_device_id *device_list = new cl_device_id[_num_devices]; cl_device_id *device_list = new cl_device_id[_num_devices];
cl_uint n; cl_uint n;
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
@ -612,7 +611,7 @@ void UCL_Device::print_all(std::ostream &out) {
// Select the platform that is associated with accelerators // Select the platform that is associated with accelerators
// if pid < 0, select the first platform // if pid < 0, select the first platform
void UCL_Device::set_platform_accelerator(int pid) { int UCL_Device::set_platform_accelerator(int pid) {
if (pid < 0) { if (pid < 0) {
int found = 0; int found = 0;
for (int n=0; n<_num_platforms; n++) { for (int n=0; n<_num_platforms; n++) {
@ -625,10 +624,11 @@ void UCL_Device::set_platform_accelerator(int pid) {
break; break;
} }
} }
if (found) break; if (found) return UCL_SUCCESS;
} }
return UCL_ERROR;
} else { } else {
set_platform(pid); return set_platform(pid);
} }
} }

View File

@ -34,8 +34,8 @@ using namespace LAMMPS_AL;
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
DeviceT::Device() : _init_count(0), _device_init(false), DeviceT::Device() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0), _gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0), _compiled(false) { _last_device(0), _platform_id(-1), _compiled(false) {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -67,6 +67,17 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
_particle_split=p_split; _particle_split=p_split;
_cell_size=cell_size; _cell_size=cell_size;
_block_pair=block_pair; _block_pair=block_pair;
// support selecting platform through "package device" keyword.
// "0:generic" will select platform 0 and tune for generic device
// "1:fermi" will select platform 1 and tune for Nvidia Fermi gpu
if (ocl_vendor) {
char *sep = NULL;
if ((sep = strstr(ocl_vendor,":"))) {
*sep = '\0';
_platform_id = atoi(ocl_vendor);
ocl_vendor = sep+1;
}
}
// Get the rank/size within the world // Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me); MPI_Comm_rank(_comm_world,&_world_me);
@ -135,6 +146,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
return -7; return -7;
#endif #endif
if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS)
return -12;
if (gpu->set(my_gpu)!=UCL_SUCCESS) if (gpu->set(my_gpu)!=UCL_SUCCESS)
return -6; return -6;
@ -191,13 +205,15 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
_ocl_vendor_string="-DUSE_OPENCL"; _ocl_vendor_string="-DUSE_OPENCL";
int token_count=0; int token_count=0;
std::string params[13]; std::string params[13];
char *pch = strtok(ocl_vendor,"\" "); char *pch = strtok(ocl_vendor,",");
pch = strtok(NULL,",");
if (pch == NULL) return -11;
while (pch != NULL) { while (pch != NULL) {
if (token_count==13) if (token_count==13)
return -11; return -11;
params[token_count]=pch; params[token_count]=pch;
token_count++; token_count++;
pch = strtok(NULL,"\" "); pch = strtok(NULL,",");
} }
_ocl_vendor_string+=" -DMEM_THREADS="+params[0]+ _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
" -DTHREADS_PER_ATOM="+params[1]+ " -DTHREADS_PER_ATOM="+params[1]+
@ -656,7 +672,7 @@ int DeviceT::compile_kernels() {
dev_program=new UCL_Program(*gpu); dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(device,compile_string().c_str()); int success=dev_program->load_string(device,compile_string().c_str());
if (success!=UCL_SUCCESS) if (success!=UCL_SUCCESS)
return -4; return -6;
k_zero.set_function(*dev_program,"kernel_zero"); k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info"); k_info.set_function(*dev_program,"kernel_info");
_compiled=true; _compiled=true;

View File

@ -292,7 +292,7 @@ class Device {
MPI_Comm _comm_world, _comm_replica, _comm_gpu; MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size; _replica_size;
int _gpu_mode, _first_device, _last_device, _nthreads; int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
double _particle_split; double _particle_split;
double _cpu_full; double _cpu_full;
double _ptx_arch; double _ptx_arch;

View File

@ -58,6 +58,9 @@ namespace GPU_EXTRA {
else if (all_success == -11) else if (all_success == -11)
error->all(FLERR, error->all(FLERR,
"Invalid custom OpenCL parameter string."); "Invalid custom OpenCL parameter string.");
else if (all_success == -12)
error->all(FLERR,
"Invalid OpenCL platform ID.");
else else
error->all(FLERR,"Unknown error in GPU library"); error->all(FLERR,"Unknown error in GPU library");
} }