various minor OpenCL related fixes and improvements to the GPU package
- document previously undocumented OpenCL tune settings - implement OpenCL platform selection through prefixing the device type with the platform id separated by a colon - allow passing custom tune parameters through postfixing the device type with the 13 tuneable parameters separated by commas - remove an extra clear() that would delete device properties structs and cause LAMMPS to output garbage strings
This commit is contained in:
@ -33,8 +33,10 @@ args = arguments specific to the style :l
|
|||||||
last = ID of last GPU to be used on each node
|
last = ID of last GPU to be used on each node
|
||||||
{tpa} value = Nthreads
|
{tpa} value = Nthreads
|
||||||
Nthreads = # of GPU threads used per atom
|
Nthreads = # of GPU threads used per atom
|
||||||
{device} value = device_type
|
{device} value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13
|
||||||
device_type = {kepler} or {fermi} or {cypress} or {generic}
|
platform_id = numerical OpenCL platform id (default: -1)
|
||||||
|
device_type = {kepler} or {fermi} or {cypress} or {intel} or {phi} or {generic} or {custom}
|
||||||
|
val1,val2,... = custom OpenCL tune parameters (see below for details)
|
||||||
{blocksize} value = size
|
{blocksize} value = size
|
||||||
size = thread block size for pair force computation
|
size = thread block size for pair force computation
|
||||||
{intel} args = NPhi keyword value ...
|
{intel} args = NPhi keyword value ...
|
||||||
@ -96,6 +98,9 @@ args = arguments specific to the style :l
|
|||||||
package gpu 1
|
package gpu 1
|
||||||
package gpu 1 split 0.75
|
package gpu 1 split 0.75
|
||||||
package gpu 2 split -1.0
|
package gpu 2 split -1.0
|
||||||
|
package gpu 1 device kepler
|
||||||
|
package gpu 1 device 2:generic
|
||||||
|
package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
|
||||||
package kokkos neigh half comm device
|
package kokkos neigh half comm device
|
||||||
package omp 0 neigh no
|
package omp 0 neigh no
|
||||||
package omp 4
|
package omp 4
|
||||||
@ -244,12 +249,40 @@ the value can improve performance. The number of threads per atom must
|
|||||||
be a power of 2 and currently cannot be greater than 32.
|
be a power of 2 and currently cannot be greater than 32.
|
||||||
|
|
||||||
The {device} keyword can be used to tune parameters optimized for a
|
The {device} keyword can be used to tune parameters optimized for a
|
||||||
specific accelerator, when using OpenCL. For CUDA, the {device}
|
specific accelerator and platform when using OpenCL. OpenCL supports
|
||||||
keyword is ignored. Currently, the device type is limited to NVIDIA
|
the concept of a [platform], which represents one or more devices that
|
||||||
Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices
|
share the same driver (e.g. there would be a different platform for
|
||||||
may be added later. The default device type can be specified when
|
GPUs from different vendors or for CPU based accelerator support).
|
||||||
building LAMMPS with the GPU library, via settings in the
|
In LAMMPS only one platform can be active at a time and by default
|
||||||
lib/gpu/Makefile that is used.
|
the first platform with an accelerator is selected. This is equivalent
|
||||||
|
to using a platform ID of -1. The platform ID is a number corresponding
|
||||||
|
to the output of the ocl_get_devices tool. The platform ID is passed
|
||||||
|
to the GPU library, by prefixing the {device} keyword with that number
|
||||||
|
separated by a colon. For CUDA, the {device} keyword is ignored.
|
||||||
|
Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
|
||||||
|
Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
|
||||||
|
More devices may be added later. The default device type can be
|
||||||
|
specified when building LAMMPS with the GPU library, via setting a
|
||||||
|
variable in the lib/gpu/Makefile that is used.
|
||||||
|
|
||||||
|
In addition, a device type {custom} is available, which is followed by
|
||||||
|
13 comma separated numbers, which allows one to set those tweakable parameters
|
||||||
|
from the package command. It can be combined with the (colon separated)
|
||||||
|
platform id. The individual settings are:
|
||||||
|
|
||||||
|
MEM_THREADS
|
||||||
|
THREADS_PER_ATOM
|
||||||
|
THREADS_PER_CHARGE
|
||||||
|
BLOCK_PAIR
|
||||||
|
MAX_SHARED_TYPES
|
||||||
|
BLOCK_NBOR_BUILD
|
||||||
|
BLOCK_BIO_PAIR
|
||||||
|
BLOCK_ELLIPSE
|
||||||
|
WARP_SIZE
|
||||||
|
PPPM_BLOCK_1D
|
||||||
|
BLOCK_CELL_2D
|
||||||
|
BLOCK_CELL_ID
|
||||||
|
MAX_BIO_SHARED_TYPES :ul
|
||||||
|
|
||||||
The {blocksize} keyword allows you to tweak the number of threads used
|
The {blocksize} keyword allows you to tweak the number of threads used
|
||||||
per thread block. This number should be a multiple of 32 (for GPUs)
|
per thread block. This number should be a multiple of 32 (for GPUs)
|
||||||
|
|||||||
@ -165,8 +165,8 @@ class UCL_Device {
|
|||||||
/// Get the current OpenCL device name
|
/// Get the current OpenCL device name
|
||||||
inline std::string name() { return name(_device); }
|
inline std::string name() { return name(_device); }
|
||||||
/// Get the OpenCL device name
|
/// Get the OpenCL device name
|
||||||
inline std::string name(const int i)
|
inline std::string name(const int i) {
|
||||||
{ return std::string(_properties[i].name); }
|
return std::string(_properties[i].name); }
|
||||||
|
|
||||||
/// Get a string telling the type of the current device
|
/// Get a string telling the type of the current device
|
||||||
inline std::string device_type_name() { return device_type_name(_device); }
|
inline std::string device_type_name() { return device_type_name(_device); }
|
||||||
@ -281,7 +281,7 @@ class UCL_Device {
|
|||||||
inline cl_device_id & cl_device() { return _cl_device; }
|
inline cl_device_id & cl_device() { return _cl_device; }
|
||||||
|
|
||||||
/// Select the platform that has accelerators
|
/// Select the platform that has accelerators
|
||||||
inline void set_platform_accelerator(int pid=-1);
|
inline int set_platform_accelerator(int pid=-1);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int _num_platforms; // Number of platforms
|
int _num_platforms; // Number of platforms
|
||||||
@ -324,6 +324,7 @@ UCL_Device::~UCL_Device() {
|
|||||||
|
|
||||||
void UCL_Device::clear() {
|
void UCL_Device::clear() {
|
||||||
_properties.clear();
|
_properties.clear();
|
||||||
|
_cl_devices.clear();
|
||||||
if (_device>-1) {
|
if (_device>-1) {
|
||||||
for (size_t i=0; i<_cq.size(); i++) {
|
for (size_t i=0; i<_cq.size(); i++) {
|
||||||
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
|
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
|
||||||
@ -520,8 +521,6 @@ int UCL_Device::device_type(const int i) {
|
|||||||
|
|
||||||
// Set the CUDA device to the specified device number
|
// Set the CUDA device to the specified device number
|
||||||
int UCL_Device::set(int num) {
|
int UCL_Device::set(int num) {
|
||||||
clear();
|
|
||||||
|
|
||||||
cl_device_id *device_list = new cl_device_id[_num_devices];
|
cl_device_id *device_list = new cl_device_id[_num_devices];
|
||||||
cl_uint n;
|
cl_uint n;
|
||||||
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
|
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
|
||||||
@ -612,7 +611,7 @@ void UCL_Device::print_all(std::ostream &out) {
|
|||||||
|
|
||||||
// Select the platform that is associated with accelerators
|
// Select the platform that is associated with accelerators
|
||||||
// if pid < 0, select the first platform
|
// if pid < 0, select the first platform
|
||||||
void UCL_Device::set_platform_accelerator(int pid) {
|
int UCL_Device::set_platform_accelerator(int pid) {
|
||||||
if (pid < 0) {
|
if (pid < 0) {
|
||||||
int found = 0;
|
int found = 0;
|
||||||
for (int n=0; n<_num_platforms; n++) {
|
for (int n=0; n<_num_platforms; n++) {
|
||||||
@ -625,10 +624,11 @@ void UCL_Device::set_platform_accelerator(int pid) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (found) break;
|
if (found) return UCL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
return UCL_ERROR;
|
||||||
} else {
|
} else {
|
||||||
set_platform(pid);
|
return set_platform(pid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -34,8 +34,8 @@ using namespace LAMMPS_AL;
|
|||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
DeviceT::Device() : _init_count(0), _device_init(false),
|
DeviceT::Device() : _init_count(0), _device_init(false),
|
||||||
_gpu_mode(GPU_FORCE), _first_device(0),
|
_gpu_mode(GPU_FORCE), _first_device(0),
|
||||||
_last_device(0), _compiled(false) {
|
_last_device(0), _platform_id(-1), _compiled(false) {
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class numtyp, class acctyp>
|
template <class numtyp, class acctyp>
|
||||||
@ -67,6 +67,17 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
|||||||
_particle_split=p_split;
|
_particle_split=p_split;
|
||||||
_cell_size=cell_size;
|
_cell_size=cell_size;
|
||||||
_block_pair=block_pair;
|
_block_pair=block_pair;
|
||||||
|
// support selecting platform through "package device" keyword.
|
||||||
|
// "0:generic" will select platform 0 and tune for generic device
|
||||||
|
// "1:fermi" will select platform 1 and tune for Nvidia Fermi gpu
|
||||||
|
if (ocl_vendor) {
|
||||||
|
char *sep = NULL;
|
||||||
|
if ((sep = strstr(ocl_vendor,":"))) {
|
||||||
|
*sep = '\0';
|
||||||
|
_platform_id = atoi(ocl_vendor);
|
||||||
|
ocl_vendor = sep+1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Get the rank/size within the world
|
// Get the rank/size within the world
|
||||||
MPI_Comm_rank(_comm_world,&_world_me);
|
MPI_Comm_rank(_comm_world,&_world_me);
|
||||||
@ -135,6 +146,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
|
|||||||
return -7;
|
return -7;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS)
|
||||||
|
return -12;
|
||||||
|
|
||||||
if (gpu->set(my_gpu)!=UCL_SUCCESS)
|
if (gpu->set(my_gpu)!=UCL_SUCCESS)
|
||||||
return -6;
|
return -6;
|
||||||
|
|
||||||
@ -191,13 +205,15 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
|
|||||||
_ocl_vendor_string="-DUSE_OPENCL";
|
_ocl_vendor_string="-DUSE_OPENCL";
|
||||||
int token_count=0;
|
int token_count=0;
|
||||||
std::string params[13];
|
std::string params[13];
|
||||||
char *pch = strtok(ocl_vendor,"\" ");
|
char *pch = strtok(ocl_vendor,",");
|
||||||
|
pch = strtok(NULL,",");
|
||||||
|
if (pch == NULL) return -11;
|
||||||
while (pch != NULL) {
|
while (pch != NULL) {
|
||||||
if (token_count==13)
|
if (token_count==13)
|
||||||
return -11;
|
return -11;
|
||||||
params[token_count]=pch;
|
params[token_count]=pch;
|
||||||
token_count++;
|
token_count++;
|
||||||
pch = strtok(NULL,"\" ");
|
pch = strtok(NULL,",");
|
||||||
}
|
}
|
||||||
_ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
|
_ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
|
||||||
" -DTHREADS_PER_ATOM="+params[1]+
|
" -DTHREADS_PER_ATOM="+params[1]+
|
||||||
@ -656,7 +672,7 @@ int DeviceT::compile_kernels() {
|
|||||||
dev_program=new UCL_Program(*gpu);
|
dev_program=new UCL_Program(*gpu);
|
||||||
int success=dev_program->load_string(device,compile_string().c_str());
|
int success=dev_program->load_string(device,compile_string().c_str());
|
||||||
if (success!=UCL_SUCCESS)
|
if (success!=UCL_SUCCESS)
|
||||||
return -4;
|
return -6;
|
||||||
k_zero.set_function(*dev_program,"kernel_zero");
|
k_zero.set_function(*dev_program,"kernel_zero");
|
||||||
k_info.set_function(*dev_program,"kernel_info");
|
k_info.set_function(*dev_program,"kernel_info");
|
||||||
_compiled=true;
|
_compiled=true;
|
||||||
|
|||||||
@ -292,7 +292,7 @@ class Device {
|
|||||||
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
|
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
|
||||||
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
|
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
|
||||||
_replica_size;
|
_replica_size;
|
||||||
int _gpu_mode, _first_device, _last_device, _nthreads;
|
int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
|
||||||
double _particle_split;
|
double _particle_split;
|
||||||
double _cpu_full;
|
double _cpu_full;
|
||||||
double _ptx_arch;
|
double _ptx_arch;
|
||||||
|
|||||||
@ -58,6 +58,9 @@ namespace GPU_EXTRA {
|
|||||||
else if (all_success == -11)
|
else if (all_success == -11)
|
||||||
error->all(FLERR,
|
error->all(FLERR,
|
||||||
"Invalid custom OpenCL parameter string.");
|
"Invalid custom OpenCL parameter string.");
|
||||||
|
else if (all_success == -12)
|
||||||
|
error->all(FLERR,
|
||||||
|
"Invalid OpenCL platform ID.");
|
||||||
else
|
else
|
||||||
error->all(FLERR,"Unknown error in GPU library");
|
error->all(FLERR,"Unknown error in GPU library");
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user