remove support for CUDA toolkits before version 8 and GPUs older than Kepler

Axel Kohlmeyer
2022-04-12 15:48:16 -04:00
parent 2ff8ac0cb2
commit f3363070e7
13 changed files with 74 additions and 253 deletions
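
This change collapses the many scattered #if CUDA_VERSION guards in Geryon into a single compile-time minimum-version check in the CUDA driver UCL_Device constructor (the #error guard below). As a minimal standalone sketch of the pattern, assuming nothing from Geryon itself: CUDA_VERSION comes from cuda.h and encodes the toolkit as 1000*major + 10*minor (8000 = 8.0, 11040 = 11.4), and cuDriverGetVersion() reports the installed driver with the same integer encoding, which is what the driver_version/1000 arithmetic in print_all() relies on.

    // Standalone sketch, not part of this commit: minimum-toolkit guard plus driver version decode.
    #include <cstdio>
    #include <cuda.h>

    #if CUDA_VERSION < 8000     // 1000*major + 10*minor, e.g. 8000 = CUDA 8.0
    #error CUDA Toolkit version 8 or later required
    #endif

    int main() {
      int v = 0;
      cuDriverGetVersion(&v);   // installed driver version, same integer encoding
      std::printf("CUDA driver %d.%d\n", v / 1000, (v % 1000) / 10);
      return 0;
    }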

View File

@@ -379,18 +379,9 @@ UCL_Device::UCL_Device() {
prop.regsPerBlock = hip_prop.regsPerBlock;
prop.clockRate = hip_prop.clockRate;
prop.computeMode = hip_prop.computeMode;
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
//#if CUDA_VERSION >= 2020
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
//#endif
//#if CUDA_VERSION >= 3010
prop.concurrentKernels = hip_prop.concurrentKernels;
//CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
//#endif
_properties.push_back(prop);
}
@@ -447,13 +438,11 @@ void UCL_Device::clear() {
// List all devices along with all properties
void UCL_Device::print_all(std::ostream &out) {
//#if CUDA_VERSION >= 2020
int driver_version;
hipDriverGetVersion(&driver_version);
out << "Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
//#endif
if (num_devices() == 0)
out << "There is no device supporting HIP\n";
@@ -470,12 +459,10 @@ void UCL_Device::print_all(std::ostream &out) {
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
//#if CUDA_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
//#endif
out << " Total amount of constant memory: "
<< _properties[i].totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
@@ -494,58 +481,29 @@ void UCL_Device::print_all(std::ostream &out) {
<< _properties[i].maxGridSize[0] << " x "
<< _properties[i].maxGridSize[1] << " x "
<< _properties[i].maxGridSize[2] << std::endl;
//out << " Maximum memory pitch: "
// << max_pitch(i) << " bytes\n";
//out << " Texture alignment: "
// << _properties[i].textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
//#if CUDA_VERSION >= 2020
//out << " Run time limit on kernels: ";
//if (_properties[i].kernelExecTimeoutEnabled)
// out << "Yes\n";
//else
// out << "No\n";
out << " Integrated: ";
if (_properties[i].integrated)
out << "Yes\n";
else
out << "No\n";
//out << " Support host page-locked memory mapping: ";
//if (_properties[i].canMapHostMemory)
// out << "Yes\n";
//else
// out << "No\n";
out << " Compute mode: ";
if (_properties[i].computeMode == hipComputeModeDefault)
out << "Default\n"; // multiple threads can use device
//#if CUDA_VERSION >= 8000
// else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
//#else
else if (_properties[i].computeMode == hipComputeModeExclusive)
//#endif
out << "Exclusive\n"; // only thread can use device
else if (_properties[i].computeMode == hipComputeModeProhibited)
out << "Prohibited\n"; // no thread can use device
//#if CUDART_VERSION >= 4000
else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
out << "Exclusive Process\n"; // multiple threads 1 process
//#endif
else
out << "Unknown\n";
//#endif
//#if CUDA_VERSION >= 3010
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
else
out << "No\n";
//out << " Device has ECC support enabled: ";
//if (_properties[i].ECCEnabled)
// out << "Yes\n";
//else
// out << "No\n";
//#endif
}
}
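
In the HIP backend the long-commented-out CUDA attribute queries are deleted outright; only the direct hipDeviceGetAttribute() call for the integrated-GPU flag remains. A minimal standalone sketch of that query (device loop and printing are illustrative, not Geryon code):

    #include <cstdio>
    #include <hip/hip_runtime.h>

    int main() {
      int ndev = 0;
      hipGetDeviceCount(&ndev);
      for (int d = 0; d < ndev; ++d) {
        int integrated = 0;   // same attribute query kept in UCL_Device::UCL_Device() above
        hipDeviceGetAttribute(&integrated, hipDeviceAttributeIntegrated, d);
        std::printf("device %d integrated: %s\n", d, integrated ? "yes" : "no");
      }
      return 0;
    }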

View File

@@ -5,11 +5,7 @@
#include <cassert>
#include <hip/hip_runtime.h>
//#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
//#else
//#define CUDA_INT_TYPE unsigned
//#endif
#ifdef MPI_GERYON
#include "mpi.h"

View File

@@ -71,9 +71,6 @@ class UCL_Texture {
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &) {
//#if CUDA_VERSION < 4000
//CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
//#endif
}
private:

View File

@@ -320,6 +320,9 @@ class UCL_Device {
// Grabs the properties for all devices
UCL_Device::UCL_Device() {
#if CUDA_VERSION < 8000
#error CUDA Toolkit version 8 or later required
#endif
CU_SAFE_CALL_NS(cuInit(0));
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
for (int i=0; i<_num_devices; ++i) {
@@ -358,16 +361,12 @@ UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
#if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
#endif
#if CUDA_VERSION >= 3010
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
#endif
_properties.push_back(prop);
}
@@ -415,13 +414,10 @@ void UCL_Device::clear() {
// List all devices along with all properties
void UCL_Device::print_all(std::ostream &out) {
#if CUDA_VERSION >= 2020
int driver_version;
cuDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
#endif
<< driver_version/1000 << "." << driver_version%100 << std::endl;
if (num_devices() == 0)
out << "There is no device supporting CUDA\n";
@@ -438,12 +434,10 @@ void UCL_Device::print_all(std::ostream &out) {
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
#if CUDA_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
#endif
out << " Total amount of constant memory: "
<< _properties[i].totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
@@ -468,7 +462,6 @@ void UCL_Device::print_all(std::ostream &out) {
<< _properties[i].textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
#if CUDA_VERSION >= 2020
out << " Run time limit on kernels: ";
if (_properties[i].kernelExecTimeoutEnabled)
out << "Yes\n";
@@ -487,22 +480,14 @@ void UCL_Device::print_all(std::ostream &out) {
out << " Compute mode: ";
if (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT)
out << "Default\n"; // multiple threads can use device
#if CUDA_VERSION >= 8000
else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
#else
else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE)
#endif
out << "Exclusive\n"; // only thread can use device
else if (_properties[i].computeMode == CU_COMPUTEMODE_PROHIBITED)
out << "Prohibited\n"; // no thread can use device
#if CUDART_VERSION >= 4000
else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
out << "Exclusive Process\n"; // multiple threads 1 process
#endif
else
out << "Unknown\n";
#endif
#if CUDA_VERSION >= 3010
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
@@ -513,7 +498,6 @@ void UCL_Device::print_all(std::ostream &out) {
out << "Yes\n";
else
out << "No\n";
#endif
}
}
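
With the toolkit floor raised to 8.0, the removed #if/#else around the compute-mode output is no longer needed: the CU_COMPUTEMODE_EXCLUSIVE branch only existed for pre-8.0 toolkits, leaving Default, Prohibited and Exclusive Process. A hypothetical standalone helper (not part of Geryon) that performs the same mapping printed above:

    #include <cuda.h>

    // Map a device's compute mode to the labels used by print_all().
    static const char *compute_mode_name(CUdevice dev) {
      int mode = CU_COMPUTEMODE_DEFAULT;
      cuDeviceGetAttribute(&mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
      if (mode == CU_COMPUTEMODE_DEFAULT)           return "Default";           // any thread may use the device
      if (mode == CU_COMPUTEMODE_PROHIBITED)        return "Prohibited";        // no thread may use the device
      if (mode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) return "Exclusive Process"; // one process at a time
      return "Unknown";
    }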

View File

@@ -165,17 +165,11 @@ class UCL_Program {
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
}
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
_param_size=0;
#endif
_num_blocks[0]=0;
set_function(program,function);
_cq=program._cq;
@@ -211,11 +205,7 @@ class UCL_Kernel {
if (index==_num_args)
add_arg(arg);
else if (index<_num_args)
#if CUDA_VERSION >= 4000
_kernel_args[index]=arg;
#else
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
#endif
else
assert(0==1); // Must add kernel parameters in sequential order
}
@@ -242,15 +232,7 @@ class UCL_Kernel {
/// Add a kernel argument.
inline void add_arg(const CUdeviceptr* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=(void *)arg;
#else
void* ptr = (void*)(size_t)(*arg);
_param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
_offsets.push_back(_param_size);
_param_size+=sizeof(ptr);
#endif
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
@@ -258,14 +240,7 @@ class UCL_Kernel {
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype* const arg) {
#if CUDA_VERSION >= 4000
_kernel_args[_num_args]=const_cast<dtype *>(arg);
#else
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
_offsets.push_back(_param_size);
_param_size+=sizeof(dtype);
#endif
_num_args++;
if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
}
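
With the pre-CUDA-4.0 cuParamSetv() path gone, add_arg() now only records the address of each argument in _kernel_args; the values are read when the kernel is launched. A rough usage sketch under that assumption (helper name and kernel setup are hypothetical; the wrapper calls mirror the methods in this header, which is assumed to be included with its namespace in scope):

    #include <cuda.h>

    // Hypothetical helper: enqueue an already-configured Geryon kernel. add_arg()
    // stores &d_x and &n, so both must stay valid until run() issues the launch.
    void enqueue_scale(UCL_Kernel &k, CUdeviceptr d_x, int n) {
      k.clear_args();
      k.add_arg(&d_x);                    // records a CUdeviceptr*, not the value
      k.add_arg(&n);                      // records an int*
      k.set_size((n + 127) / 128, 128);   // n/128 blocks of 128 threads each
      k.run();                            // cuLaunchKernel reads the argument values here
    }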
@@ -298,13 +273,9 @@ class UCL_Kernel {
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size;
_block_size[1]=1;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
@@ -323,13 +294,9 @@ class UCL_Kernel {
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=1;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
@@ -350,14 +317,9 @@ class UCL_Kernel {
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
_num_blocks[2]=1;
#if CUDA_VERSION >= 4000
_block_size[0]=block_size_x;
_block_size[1]=block_size_y;
_block_size[2]=block_size_z;
#else
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z));
#endif
}
/// Set the number of thread blocks and the number of threads in each block
@@ -373,23 +335,14 @@ class UCL_Kernel {
/// Run the kernel in the default command queue
inline void run() {
#if CUDA_VERSION >= 4000
CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
_num_blocks[2],_block_size[0],_block_size[1],
_block_size[2],0,_cq,_kernel_args,nullptr));
#else
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
#endif
}
/// Clear any arguments associated with the kernel
inline void clear_args() {
_num_args=0;
#if CUDA_VERSION < 4000
_offsets.clear();
_param_size=0;
#endif
}
/// Return the default command queue/stream associated with this data
@@ -406,13 +359,8 @@ class UCL_Kernel {
unsigned _num_args;
friend class UCL_Texture;
#if CUDA_VERSION >= 4000
unsigned _block_size[3];
void * _kernel_args[UCL_MAX_KERNEL_ARGS];
#else
std::vector<unsigned> _offsets;
unsigned _param_size;
#endif
};
} // namespace
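
The retained run() path maps directly onto cuLaunchKernel(): grid and block dimensions are passed as call arguments and the kernel parameters as an array of pointers, which is why cuFuncSetBlockShape(), cuParamSetSize() and cuLaunchGridAsync() could be removed. A standalone sketch of that call (function and argument names are illustrative):

    #include <cuda.h>

    // Launch a kernel through the driver API the way UCL_Kernel::run() does:
    // kernel arguments are passed as an array of pointers to each argument value.
    void launch_scale(CUfunction f, CUstream cq, CUdeviceptr d_x, float alpha, int n) {
      void *args[] = { &d_x, &alpha, &n };
      unsigned block = 256;
      unsigned grid  = (unsigned)((n + block - 1) / block);
      cuLaunchKernel(f, grid, 1, 1,       // grid dimensions
                     block, 1, 1,         // block dimensions
                     0,                   // dynamic shared memory in bytes
                     cq,                  // stream / command queue
                     args, nullptr);      // kernelParams, extra
    }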

View File

@@ -5,11 +5,7 @@
#include <cassert>
#include <cuda.h>
#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
#else
#define CUDA_INT_TYPE unsigned
#endif
#ifdef MPI_GERYON
#include "mpi.h"

View File

@@ -69,9 +69,6 @@ class UCL_Texture {
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
#if CUDA_VERSION < 4000
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
#endif
}
private:

View File

@@ -25,21 +25,8 @@
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H
#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24
#define MEM_THREADS 16
#else
#define mul24(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif
#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4
{
double x, y, z, w;
};
typedef struct _double4 double4;
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
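
With architectures below compute capability 2.0 no longer supported, mul24() is always a plain multiplication and MEM_THREADS is fixed at 32, so device code built on these macros behaves the same on every supported (Kepler or newer) GPU. A minimal sketch of a kernel written against the retained macros (kernel name and arguments are illustrative, and ucl_nv_kernel.h is assumed to be included):

    __global__ void scale_vector(float *v, float alpha, int n) {
      int i = GLOBAL_ID_X;          // threadIdx.x + mul24(blockIdx.x, blockDim.x)
      if (i < n) v[i] *= alpha;     // mul24(X,Y) now expands to (X)*(Y) on all targets
    }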