various minor OpenCL related fixes and improvements to the GPU package

- document previously undocumented OpenCL tune settings
- implement OpenCL platform selection through prefixing the device type with the platform id separated by a colon
- allow passing custom tune parameters through postfixing the device type with the 13 tunable parameters separated by commas
- remove an extra clear() that would delete device properties structs and cause LAMMPS to output garbage strings
This commit is contained in:
Axel Kohlmeyer
2018-07-20 14:41:54 -04:00
parent 36081f9ffd
commit de8176b4fc
5 changed files with 74 additions and 22 deletions

View File

@ -33,8 +33,10 @@ args = arguments specific to the style :l
last = ID of last GPU to be used on each node last = ID of last GPU to be used on each node
{tpa} value = Nthreads {tpa} value = Nthreads
Nthreads = # of GPU threads used per atom Nthreads = # of GPU threads used per atom
{device} value = device_type {device} value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13
device_type = {kepler} or {fermi} or {cypress} or {generic} platform_id = numerical OpenCL platform id (default: -1)
device_type = {kepler} or {fermi} or {cypress} or {intel} or {phi} or {generic} or {custom}
val1,val2,... = custom OpenCL tune parameters (see below for details)
{blocksize} value = size {blocksize} value = size
size = thread block size for pair force computation size = thread block size for pair force computation
{intel} args = NPhi keyword value ... {intel} args = NPhi keyword value ...
@ -96,6 +98,9 @@ args = arguments specific to the style :l
package gpu 1 package gpu 1
package gpu 1 split 0.75 package gpu 1 split 0.75
package gpu 2 split -1.0 package gpu 2 split -1.0
package gpu 1 device kepler
package gpu 1 device 2:generic
package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
package kokkos neigh half comm device package kokkos neigh half comm device
package omp 0 neigh no package omp 0 neigh no
package omp 4 package omp 4
@ -244,12 +249,40 @@ the value can improve performance. The number of threads per atom must
be a power of 2 and currently cannot be greater than 32. be a power of 2 and currently cannot be greater than 32.
The {device} keyword can be used to tune parameters optimized for a The {device} keyword can be used to tune parameters optimized for a
specific accelerator, when using OpenCL. For CUDA, the {device} specific accelerator and platform when using OpenCL. OpenCL supports
keyword is ignored. Currently, the device type is limited to NVIDIA the concept of a [platform], which represents one or more devices that
Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices share the same driver (e.g. there would be a different platform for
may be added later. The default device type can be specified when GPUs from different vendors or for CPU based accelerator support).
building LAMMPS with the GPU library, via settings in the In LAMMPS only one platform can be active at a time and by default
lib/gpu/Makefile that is used. the first platform with an accelerator is selected. This is equivalent
to using a platform ID of -1. The platform ID is a number corresponding
to the output of the ocl_get_devices tool. The platform ID is passed
to the GPU library, by prefixing the {device} keyword with that number
separated by a colon. For CUDA, the {device} keyword is ignored.
Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
More devices may be added later. The default device type can be
specified when building LAMMPS with the GPU library, via setting a
variable in the lib/gpu/Makefile that is used.
In addition, a device type {custom} is available, which is followed by
13 comma-separated numbers, which allows setting those tunable parameters
from the package command. It can be combined with the (colon separated)
platform id. The individual settings are:
MEM_THREADS
THREADS_PER_ATOM
THREADS_PER_CHARGE
BLOCK_PAIR
MAX_SHARED_TYPES
BLOCK_NBOR_BUILD
BLOCK_BIO_PAIR
BLOCK_ELLIPSE
WARP_SIZE
PPPM_BLOCK_1D
BLOCK_CELL_2D
BLOCK_CELL_ID
MAX_BIO_SHARED_TYPES :ul
The {blocksize} keyword allows you to tweak the number of threads used The {blocksize} keyword allows you to tweak the number of threads used
per thread block. This number should be a multiple of 32 (for GPUs) per thread block. This number should be a multiple of 32 (for GPUs)

View File

@ -165,8 +165,8 @@ class UCL_Device {
/// Get the current OpenCL device name /// Get the current OpenCL device name
inline std::string name() { return name(_device); } inline std::string name() { return name(_device); }
/// Get the OpenCL device name /// Get the OpenCL device name
inline std::string name(const int i) inline std::string name(const int i) {
{ return std::string(_properties[i].name); } return std::string(_properties[i].name); }
/// Get a string telling the type of the current device /// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); } inline std::string device_type_name() { return device_type_name(_device); }
@ -281,7 +281,7 @@ class UCL_Device {
inline cl_device_id & cl_device() { return _cl_device; } inline cl_device_id & cl_device() { return _cl_device; }
/// Select the platform that has accelerators /// Select the platform that has accelerators
inline void set_platform_accelerator(int pid=-1); inline int set_platform_accelerator(int pid=-1);
private: private:
int _num_platforms; // Number of platforms int _num_platforms; // Number of platforms
@ -324,6 +324,7 @@ UCL_Device::~UCL_Device() {
void UCL_Device::clear() { void UCL_Device::clear() {
_properties.clear(); _properties.clear();
_cl_devices.clear();
if (_device>-1) { if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) { for (size_t i=0; i<_cq.size(); i++) {
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
@ -520,8 +521,6 @@ int UCL_Device::device_type(const int i) {
// Set the CUDA device to the specified device number // Set the CUDA device to the specified device number
int UCL_Device::set(int num) { int UCL_Device::set(int num) {
clear();
cl_device_id *device_list = new cl_device_id[_num_devices]; cl_device_id *device_list = new cl_device_id[_num_devices];
cl_uint n; cl_uint n;
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
@ -612,7 +611,7 @@ void UCL_Device::print_all(std::ostream &out) {
// Select the platform that is associated with accelerators // Select the platform that is associated with accelerators
// if pid < 0, select the first platform // if pid < 0, select the first platform
void UCL_Device::set_platform_accelerator(int pid) { int UCL_Device::set_platform_accelerator(int pid) {
if (pid < 0) { if (pid < 0) {
int found = 0; int found = 0;
for (int n=0; n<_num_platforms; n++) { for (int n=0; n<_num_platforms; n++) {
@ -625,10 +624,11 @@ void UCL_Device::set_platform_accelerator(int pid) {
break; break;
} }
} }
if (found) break; if (found) return UCL_SUCCESS;
} }
return UCL_ERROR;
} else { } else {
set_platform(pid); return set_platform(pid);
} }
} }

View File

@ -34,8 +34,8 @@ using namespace LAMMPS_AL;
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
DeviceT::Device() : _init_count(0), _device_init(false), DeviceT::Device() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0), _gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0), _compiled(false) { _last_device(0), _platform_id(-1), _compiled(false) {
} }
template <class numtyp, class acctyp> template <class numtyp, class acctyp>
@ -67,6 +67,17 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
_particle_split=p_split; _particle_split=p_split;
_cell_size=cell_size; _cell_size=cell_size;
_block_pair=block_pair; _block_pair=block_pair;
// support selecting platform through "package device" keyword.
// "0:generic" will select platform 0 and tune for generic device
// "1:fermi" will select platform 1 and tune for Nvidia Fermi gpu
if (ocl_vendor) {
char *sep = NULL;
if ((sep = strstr(ocl_vendor,":"))) {
*sep = '\0';
_platform_id = atoi(ocl_vendor);
ocl_vendor = sep+1;
}
}
// Get the rank/size within the world // Get the rank/size within the world
MPI_Comm_rank(_comm_world,&_world_me); MPI_Comm_rank(_comm_world,&_world_me);
@ -135,6 +146,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
return -7; return -7;
#endif #endif
if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS)
return -12;
if (gpu->set(my_gpu)!=UCL_SUCCESS) if (gpu->set(my_gpu)!=UCL_SUCCESS)
return -6; return -6;
@ -191,13 +205,15 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
_ocl_vendor_string="-DUSE_OPENCL"; _ocl_vendor_string="-DUSE_OPENCL";
int token_count=0; int token_count=0;
std::string params[13]; std::string params[13];
char *pch = strtok(ocl_vendor,"\" "); char *pch = strtok(ocl_vendor,",");
pch = strtok(NULL,",");
if (pch == NULL) return -11;
while (pch != NULL) { while (pch != NULL) {
if (token_count==13) if (token_count==13)
return -11; return -11;
params[token_count]=pch; params[token_count]=pch;
token_count++; token_count++;
pch = strtok(NULL,"\" "); pch = strtok(NULL,",");
} }
_ocl_vendor_string+=" -DMEM_THREADS="+params[0]+ _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
" -DTHREADS_PER_ATOM="+params[1]+ " -DTHREADS_PER_ATOM="+params[1]+
@ -656,7 +672,7 @@ int DeviceT::compile_kernels() {
dev_program=new UCL_Program(*gpu); dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(device,compile_string().c_str()); int success=dev_program->load_string(device,compile_string().c_str());
if (success!=UCL_SUCCESS) if (success!=UCL_SUCCESS)
return -4; return -6;
k_zero.set_function(*dev_program,"kernel_zero"); k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info"); k_info.set_function(*dev_program,"kernel_info");
_compiled=true; _compiled=true;

View File

@ -292,7 +292,7 @@ class Device {
MPI_Comm _comm_world, _comm_replica, _comm_gpu; MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size; _replica_size;
int _gpu_mode, _first_device, _last_device, _nthreads; int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
double _particle_split; double _particle_split;
double _cpu_full; double _cpu_full;
double _ptx_arch; double _ptx_arch;

View File

@ -58,6 +58,9 @@ namespace GPU_EXTRA {
else if (all_success == -11) else if (all_success == -11)
error->all(FLERR, error->all(FLERR,
"Invalid custom OpenCL parameter string."); "Invalid custom OpenCL parameter string.");
else if (all_success == -12)
error->all(FLERR,
"Invalid OpenCL platform ID.");
else else
error->all(FLERR,"Unknown error in GPU library"); error->all(FLERR,"Unknown error in GPU library");
} }