Merge pull request #3213 from akohlmey/gpu-remove-legacy
Remove support for CUDA toolkits before version 8 and GPUs older than Kepler
@@ -80,14 +80,14 @@ if(GPU_API STREQUAL "CUDA")
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH}")

# apply the following to build "fat" CUDA binaries only for known CUDA toolkits
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
message(WARNING "Untested CUDA Toolkit version. Use at your own risk")
# apply the following to build "fat" CUDA binaries only for known CUDA toolkits since version 8.0
# only the Kepler architecture and beyond is supported
if(CUDA_VERSION VERSION_LESS 8.0)
message(FATAL_ERROR "CUDA Toolkit version 8.0 or later is required")
elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
message(WARNING "Untested CUDA Toolkit version ${CUDA_VERSION}. Use at your own risk")
set(GPU_CUDA_GENCODE "-arch=all")
else()
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
if((CUDA_VERSION VERSION_GREATER_EQUAL "3.2") AND (CUDA_VERSION VERSION_LESS "9.0"))
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_20,code=[sm_20,compute_20] ")
endif()
# Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30] ")
@@ -120,6 +120,14 @@ if(GPU_API STREQUAL "CUDA")
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1")
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_86,code=[sm_86,compute_86]")
endif()
# Hopper (GPU Arch 9.0) is supported by CUDA 12.0? and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]")
endif()
# # Lovelace (GPU Arch 9.x) is supported by CUDA 12.0? and later
#if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
# string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_9x,code=[sm_9x,compute_9x]")
#endif()
endif()

cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC}
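For illustration (not part of this diff): with GPU_ARCH=sm_60 and a CUDA 11.4 toolkit, the cascade above would assemble roughly the following nvcc options; -arch=sm_60 is compiled directly, and each -gencode entry adds one architecture to the fat binary (the entries between compute_30 and compute_86 are cut out of this hunk and inferred from the parallel HIP block further down):

    -arch=sm_60
    -gencode arch=compute_35,code=[sm_35,compute_35]
    -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]
    -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61]
    -gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_75,code=[sm_75,compute_75]
    -gencode arch=compute_80,code=[sm_80,compute_80] -gencode arch=compute_86,code=[sm_86,compute_86]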
@@ -257,43 +265,47 @@ elseif(GPU_API STREQUAL "HIP")
find_package(CUDA REQUIRED)
set(HIP_ARCH "sm_50" CACHE STRING "HIP primary CUDA architecture (e.g. sm_60)")

# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(HIP_CUDA_GENCODE "-arch=${HIP_ARCH}")
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
if((CUDA_VERSION VERSION_GREATER_EQUAL "3.2") AND (CUDA_VERSION VERSION_LESS "9.0"))
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_20,code=[sm_20,compute_20]")
endif()
# Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30]")
endif()
# Kepler (GPU Arch 3.5) is supported by CUDA 5 to CUDA 11.0
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "12.0"))
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_35,code=[sm_35,compute_35]")
endif()
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "6.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]")
endif()
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61]")
endif()
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_70,code=[sm_70,compute_70]")
endif()
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_75,code=[sm_75,compute_75]")
endif()
# Ampere (GPU Arch 8.0) is supported by CUDA 11 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_80,code=[sm_80,compute_80]")
endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
message(WARNING "Unsupported CUDA version. Use at your own risk.")
if(CUDA_VERSION VERSION_LESS 8.0)
message(FATAL_ERROR "CUDA Toolkit version 8.0 or later is required")
elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
message(WARNING "Untested CUDA Toolkit version ${CUDA_VERSION}. Use at your own risk")
set(HIP_CUDA_GENCODE "-arch=all")
else()
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(HIP_CUDA_GENCODE "-arch=${HIP_ARCH}")
# Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30]")
endif()
# Kepler (GPU Arch 3.5) is supported by CUDA 5 to CUDA 11.0
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "12.0"))
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_35,code=[sm_35,compute_35]")
endif()
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "6.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]")
endif()
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61]")
endif()
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_70,code=[sm_70,compute_70]")
endif()
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_75,code=[sm_75,compute_75]")
endif()
# Ampere (GPU Arch 8.0) is supported by CUDA 11 and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_80,code=[sm_80,compute_80]")
endif()
# Hopper (GPU Arch 9.0) is supported by CUDA 12.0? and later
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]")
endif()
endif()
endif()

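As a configuration sketch (hypothetical command line, not from this commit; PKG_GPU and GPU_API are standard LAMMPS CMake options, HIP_ARCH is the cache variable defined above):

    # configure the GPU package for HIP on an NVIDIA back end, targeting Pascal
    cmake -D PKG_GPU=on -D GPU_API=hip -D HIP_ARCH=sm_60 ../cmake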
@@ -141,8 +141,6 @@ CMake build

:code:`GPU_ARCH` settings for different GPU hardware are as follows:

* sm_12 or sm_13 for GT200 (supported by CUDA 3.2 until CUDA 6.5)
* sm_20 or sm_21 for Fermi (supported by CUDA 3.2 until CUDA 7.5)
* sm_30 for Kepler (supported since CUDA 5 and until CUDA 10.x)
* sm_35 or sm_37 for Kepler (supported since CUDA 5 and until CUDA 11.x)
* sm_50 or sm_52 for Maxwell (supported since CUDA 6)
@@ -160,6 +158,12 @@ Thus the GPU_ARCH setting is merely an optimization, to have code for
the preferred GPU architecture directly included rather than having to wait
for the JIT compiler of the CUDA driver to translate it.

When compiling for CUDA or HIP with CUDA, version 8.0 or later of the CUDA
toolkit and a GPU architecture of Kepler or later are required; the architecture
must *also* be supported by the CUDA toolkit **and** the CUDA driver in use.
When compiling for OpenCL, OpenCL version 1.2 or later is required and the
GPU must be supported by the GPU driver and OpenCL runtime bundled with the driver.

When building with CMake, you **must NOT** build the GPU library in ``lib/gpu``
using the traditional build procedure. CMake will detect files generated by that
process and will terminate with an error and a suggestion for how to remove them.

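For example, a minimal CMake configuration sketch (illustrative values; the ../cmake source path assumes the standard LAMMPS build layout):

    # configure the GPU package for CUDA with a Kepler-or-later target
    cmake -D PKG_GPU=on -D GPU_API=cuda -D GPU_ARCH=sm_60 ../cmake

With this commit, any CUDA toolkit older than 8.0 makes this configuration step stop with the FATAL_ERROR shown in the CMake hunk above.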
@@ -13,14 +13,6 @@ endif

NVCC = nvcc

# obsolete hardware. not supported by current drivers anymore.
#CUDA_ARCH = -arch=sm_13
#CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE

# Fermi hardware
#CUDA_ARCH = -arch=sm_20
#CUDA_ARCH = -arch=sm_21

# Kepler hardware
#CUDA_ARCH = -arch=sm_30
#CUDA_ARCH = -arch=sm_32
@@ -45,6 +37,9 @@ CUDA_ARCH = -arch=sm_50
#CUDA_ARCH = -arch=sm_80
#CUDA_ARCH = -arch=sm_86

# Hopper hardware
#CUDA_ARCH = -arch=sm_90

CUDA_CODE = -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] \
            -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] \
            -gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_75,code=[sm_75,compute_75] \

@@ -171,7 +171,13 @@ NOTE: when compiling with CMake, all of the considerations listed below
are handled within the CMake configuration process, so no separate
compilation of the gpu library is required. Also, this will build in support
for all compute architectures that are supported by the CUDA toolkit version
used to build the gpu library.
used to build the gpu library. A similar setup is possible using
Makefile.linux_multi after adjusting the settings for the CUDA toolkit in use.

Only CUDA toolkit version 8.0 and later and only GPU architecture 3.0
(aka Kepler) and later are supported by this version of LAMMPS. If you want
to use older hardware or software, you have to compile for OpenCL or use an older
version of LAMMPS.

If you do not want to use a fat binary that supports multiple CUDA
architectures, CUDA_ARCH must be set to match the GPU architecture. This
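For instance, a single-architecture (non-fat) build might set the following in lib/gpu/Makefile.linux (illustrative values; leaving CUDA_CODE empty is an assumption about how the stock Makefile consumes these variables):

    CUDA_ARCH = -arch=sm_70    # match the target GPU, here Volta
    CUDA_CODE =                # no additional fat-binary -gencode entries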
@@ -225,7 +231,8 @@ If GERYON_NUMA_FISSION is defined at build time, LAMMPS will consider separate
NUMA nodes on GPUs or accelerators as separate devices. For example, a 2-socket
CPU would appear as two separate devices for OpenCL (and LAMMPS would require
two MPI processes to use both sockets with the GPU library - each with its
own device ID as output by ocl_get_devices).
own device ID as output by ocl_get_devices). OpenCL version 1.2 or later is
required.

For a debug build, use "-DUCL_DEBUG -DGERYON_KERNEL_DUMP" and remove
"-DUCL_NO_EXIT" and "-DMPI_GERYON" from the build options.

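As an illustration of the two-rank requirement (hypothetical command; -sf gpu and -pk gpu are standard LAMMPS command-line switches, the input file name is made up):

    # one MPI rank per NUMA-split OpenCL device
    mpirun -np 2 ./lmp -sf gpu -pk gpu 2 -in in.lj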
@@ -379,18 +379,9 @@ UCL_Device::UCL_Device() {
    prop.regsPerBlock = hip_prop.regsPerBlock;
    prop.clockRate = hip_prop.clockRate;
    prop.computeMode = hip_prop.computeMode;
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));

    //#if CUDA_VERSION >= 2020
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
    CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
    //#endif
    //#if CUDA_VERSION >= 3010
    prop.concurrentKernels = hip_prop.concurrentKernels;
    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
    //#endif

    _properties.push_back(prop);
  }
@@ -447,13 +438,11 @@ void UCL_Device::clear() {

// List all devices along with all properties
void UCL_Device::print_all(std::ostream &out) {
  //#if CUDA_VERSION >= 2020
  int driver_version;
  hipDriverGetVersion(&driver_version);
  out << "Driver Version: "
      << driver_version/1000 << "." << driver_version%100
      << std::endl;
  //#endif

  if (num_devices() == 0)
    out << "There is no device supporting HIP\n";
@@ -470,12 +459,10 @@ void UCL_Device::print_all(std::ostream &out) {
      out << "No\n";
    out << " Total amount of global memory: "
        << gigabytes(i) << " GB\n";
    //#if CUDA_VERSION >= 2000
    out << " Number of compute units/multiprocessors: "
        << _properties[i].multiProcessorCount << std::endl;
    out << " Number of cores: "
        << cores(i) << std::endl;
    //#endif
    out << " Total amount of constant memory: "
        << _properties[i].totalConstantMemory << " bytes\n";
    out << " Total amount of local/shared memory per block: "
@@ -494,58 +481,29 @@ void UCL_Device::print_all(std::ostream &out) {
        << _properties[i].maxGridSize[0] << " x "
        << _properties[i].maxGridSize[1] << " x "
        << _properties[i].maxGridSize[2] << std::endl;
    //out << " Maximum memory pitch: "
    //    << max_pitch(i) << " bytes\n";
    //out << " Texture alignment: "
    //    << _properties[i].textureAlign << " bytes\n";
    out << " Clock rate: "
        << clock_rate(i) << " GHz\n";
    //#if CUDA_VERSION >= 2020
    //out << " Run time limit on kernels: ";
    //if (_properties[i].kernelExecTimeoutEnabled)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    out << " Integrated: ";
    if (_properties[i].integrated)
      out << "Yes\n";
    else
      out << "No\n";
    //out << " Support host page-locked memory mapping: ";
    //if (_properties[i].canMapHostMemory)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    out << " Compute mode: ";
    if (_properties[i].computeMode == hipComputeModeDefault)
      out << "Default\n"; // multiple threads can use device
    //#if CUDA_VERSION >= 8000
    // else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
    //#else
    else if (_properties[i].computeMode == hipComputeModeExclusive)
    //#endif
      out << "Exclusive\n"; // only one thread can use device
    else if (_properties[i].computeMode == hipComputeModeProhibited)
      out << "Prohibited\n"; // no thread can use device
    //#if CUDART_VERSION >= 4000
    else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
      out << "Exclusive Process\n"; // multiple threads 1 process
    //#endif
    else
      out << "Unknown\n";
    //#endif
    //#if CUDA_VERSION >= 3010
    out << " Concurrent kernel execution: ";
    if (_properties[i].concurrentKernels)
      out << "Yes\n";
    else
      out << "No\n";
    //out << " Device has ECC support enabled: ";
    //if (_properties[i].ECCEnabled)
    //  out << "Yes\n";
    //else
    //  out << "No\n";
    //#endif
  }
}

@@ -5,11 +5,7 @@
#include <cassert>
#include <hip/hip_runtime.h>

//#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
//#else
//#define CUDA_INT_TYPE unsigned
//#endif

#ifdef MPI_GERYON
#include "mpi.h"

@@ -71,9 +71,6 @@ class UCL_Texture {

  /// Make a texture reference available to kernel
  inline void allow(UCL_Kernel &) {
    //#if CUDA_VERSION < 4000
    //CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
    //#endif
  }

 private:
@@ -320,6 +320,9 @@ class UCL_Device {

// Grabs the properties for all devices
UCL_Device::UCL_Device() {
#if CUDA_VERSION < 8000
#error CUDA Toolkit version 8 or later required
#endif
  CU_SAFE_CALL_NS(cuInit(0));
  CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
  for (int i=0; i<_num_devices; ++i) {
@@ -358,16 +361,12 @@ UCL_Device::UCL_Device() {
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev));
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));

#if CUDA_VERSION >= 2020
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
#endif
#if CUDA_VERSION >= 3010
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
    CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
#endif

    _properties.push_back(prop);
  }
@@ -415,13 +414,10 @@ void UCL_Device::clear() {

// List all devices along with all properties
void UCL_Device::print_all(std::ostream &out) {
#if CUDA_VERSION >= 2020
  int driver_version;
  cuDriverGetVersion(&driver_version);
  out << "CUDA Driver Version: "
      << driver_version/1000 << "." << driver_version%100
      << std::endl;
#endif
      << driver_version/1000 << "." << driver_version%100 << std::endl;

  if (num_devices() == 0)
    out << "There is no device supporting CUDA\n";
@@ -438,12 +434,10 @@ void UCL_Device::print_all(std::ostream &out) {
      out << "No\n";
    out << " Total amount of global memory: "
        << gigabytes(i) << " GB\n";
#if CUDA_VERSION >= 2000
    out << " Number of compute units/multiprocessors: "
        << _properties[i].multiProcessorCount << std::endl;
    out << " Number of cores: "
        << cores(i) << std::endl;
#endif
    out << " Total amount of constant memory: "
        << _properties[i].totalConstantMemory << " bytes\n";
    out << " Total amount of local/shared memory per block: "
@@ -468,7 +462,6 @@ void UCL_Device::print_all(std::ostream &out) {
        << _properties[i].textureAlign << " bytes\n";
    out << " Clock rate: "
        << clock_rate(i) << " GHz\n";
#if CUDA_VERSION >= 2020
    out << " Run time limit on kernels: ";
    if (_properties[i].kernelExecTimeoutEnabled)
      out << "Yes\n";
@@ -487,22 +480,14 @@ void UCL_Device::print_all(std::ostream &out) {
    out << " Compute mode: ";
    if (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT)
      out << "Default\n"; // multiple threads can use device
#if CUDA_VERSION >= 8000
    else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
#else
    else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE)
#endif
      out << "Exclusive\n"; // only one thread can use device
    else if (_properties[i].computeMode == CU_COMPUTEMODE_PROHIBITED)
      out << "Prohibited\n"; // no thread can use device
#if CUDART_VERSION >= 4000
    else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      out << "Exclusive Process\n"; // multiple threads 1 process
#endif
    else
      out << "Unknown\n";
#endif
#if CUDA_VERSION >= 3010
    out << " Concurrent kernel execution: ";
    if (_properties[i].concurrentKernels)
      out << "Yes\n";
@@ -513,7 +498,6 @@ void UCL_Device::print_all(std::ostream &out) {
      out << "Yes\n";
    else
      out << "No\n";
#endif
  }
}

@@ -165,17 +165,11 @@ class UCL_Program {
class UCL_Kernel {
 public:
  UCL_Kernel() : _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
    _param_size=0;
#endif
    _num_blocks[0]=0;
  }

  UCL_Kernel(UCL_Program &program, const char *function) :
    _dimensions(1), _num_args(0) {
#if CUDA_VERSION < 4000
    _param_size=0;
#endif
    _num_blocks[0]=0;
    set_function(program,function);
    _cq=program._cq;
@@ -211,11 +205,7 @@ class UCL_Kernel {
    if (index==_num_args)
      add_arg(arg);
    else if (index<_num_args)
#if CUDA_VERSION >= 4000
      _kernel_args[index]=arg;
#else
      CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
#endif
    else
      assert(0==1); // Must add kernel parameters in sequential order
  }
@@ -242,15 +232,7 @@ class UCL_Kernel {

  /// Add a kernel argument.
  inline void add_arg(const CUdeviceptr* const arg) {
#if CUDA_VERSION >= 4000
    _kernel_args[_num_args]=(void *)arg;
#else
    void* ptr = (void*)(size_t)(*arg);
    _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
    CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
    _offsets.push_back(_param_size);
    _param_size+=sizeof(ptr);
#endif
    _num_args++;
    if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
  }
@@ -258,14 +240,7 @@ class UCL_Kernel {
  /// Add a kernel argument.
  template <class dtype>
  inline void add_arg(const dtype* const arg) {
#if CUDA_VERSION >= 4000
    _kernel_args[_num_args]=const_cast<dtype *>(arg);
#else
    _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
    CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
    _offsets.push_back(_param_size);
    _param_size+=sizeof(dtype);
#endif
    _num_args++;
    if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
  }
@@ -298,13 +273,9 @@ class UCL_Kernel {
    _num_blocks[0]=num_blocks;
    _num_blocks[1]=1;
    _num_blocks[2]=1;
#if CUDA_VERSION >= 4000
    _block_size[0]=block_size;
    _block_size[1]=1;
    _block_size[2]=1;
#else
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
#endif
  }

  /// Set the number of thread blocks and the number of threads in each block
@@ -323,13 +294,9 @@ class UCL_Kernel {
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;
#if CUDA_VERSION >= 4000
    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=1;
#else
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
#endif
  }

  /// Set the number of thread blocks and the number of threads in each block
@@ -350,14 +317,9 @@ class UCL_Kernel {
    _num_blocks[0]=num_blocks_x;
    _num_blocks[1]=num_blocks_y;
    _num_blocks[2]=1;
#if CUDA_VERSION >= 4000
    _block_size[0]=block_size_x;
    _block_size[1]=block_size_y;
    _block_size[2]=block_size_z;
#else
    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
                                     block_size_z));
#endif
  }

  /// Set the number of thread blocks and the number of threads in each block
@@ -373,23 +335,14 @@ class UCL_Kernel {

  /// Run the kernel in the default command queue
  inline void run() {
#if CUDA_VERSION >= 4000
    CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
                                _num_blocks[2],_block_size[0],_block_size[1],
                                _block_size[2],0,_cq,_kernel_args,nullptr));
#else
    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
#endif
  }

  /// Clear any arguments associated with the kernel
  inline void clear_args() {
    _num_args=0;
#if CUDA_VERSION < 4000
    _offsets.clear();
    _param_size=0;
#endif
  }

  /// Return the default command queue/stream associated with this data
@@ -406,13 +359,8 @@ class UCL_Kernel {
  unsigned _num_args;
  friend class UCL_Texture;

#if CUDA_VERSION >= 4000
  unsigned _block_size[3];
  void * _kernel_args[UCL_MAX_KERNEL_ARGS];
#else
  std::vector<unsigned> _offsets;
  unsigned _param_size;
#endif
};

} // namespace

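A minimal host-side sketch of how this kernel wrapper is meant to be used (hypothetical fragment: the kernel name and launch geometry are invented, and set_size() is an assumed name for the block/grid setters whose bodies appear above; only the constructor, add_arg(), and run() appear verbatim in these hunks):

    // hypothetical usage of the CUDA >= 4.0 code path
    UCL_Kernel k(prog, "k_scale");  // look up the kernel in an existing UCL_Program
    int n = 1024;
    k.add_arg(&dev_buf);            // CUdeviceptr* overload; pointer lands in _kernel_args
    k.add_arg(&n);                  // templated overload, same argument table
    k.set_size(n/128, 128);         // assumed setter; fills _num_blocks/_block_size
    k.run();                        // launches via cuLaunchKernel on the default stream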
@@ -5,11 +5,7 @@
#include <cassert>
#include <cuda.h>

#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
#else
#define CUDA_INT_TYPE unsigned
#endif

#ifdef MPI_GERYON
#include "mpi.h"

@@ -69,9 +69,6 @@ class UCL_Texture {

  /// Make a texture reference available to kernel
  inline void allow(UCL_Kernel &kernel) {
#if CUDA_VERSION < 4000
    CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
#endif
  }

 private:
@@ -25,21 +25,8 @@
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H

#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24
#define MEM_THREADS 16
#else
#define mul24(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif

#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4
{
  double x, y, z, w;
};
typedef struct _double4 double4;
#endif

#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)

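As a usage sketch (hypothetical CUDA kernel, not from this commit; GLOBAL_ID_X and mul24 are the macros defined in this header):

    // hypothetical kernel using the GLOBAL_ID_X macro from above
    __global__ void k_scale(float *x, int n, float f) {
      int i = GLOBAL_ID_X;  // threadIdx.x+mul24(blockIdx.x,blockDim.x)
      if (i < n) x[i] *= f;
    }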
@@ -58,49 +58,6 @@
#define MAX_BIO_SHARED_TYPES 128
#define PPPM_MAX_SPLINE 8

// -------------------------------------------------------------------------
// LEGACY DEVICE CONFIGURATION
// -------------------------------------------------------------------------

#ifdef __CUDA_ARCH__

#if (__CUDA_ARCH__ < 200)

#undef CONFIG_ID
#define CONFIG_ID 101
#define MEM_THREADS 16
#undef THREADS_PER_ATOM
#define THREADS_PER_ATOM 1
#undef THREADS_PER_CHARGE
#define THREADS_PER_CHARGE 16
#undef BLOCK_PAIR
#define BLOCK_PAIR 64
#undef BLOCK_BIO_PAIR
#define BLOCK_BIO_PAIR 64
#undef BLOCK_NBOR_BUILD
#define BLOCK_NBOR_BUILD 64
#undef MAX_SHARED_TYPES
#define MAX_SHARED_TYPES 8
#undef SHUFFLE_AVAIL
#define SHUFFLE_AVAIL 0

#elseif (__CUDA_ARCH__ < 300)

#undef CONFIG_ID
#define CONFIG_ID 102
#undef BLOCK_PAIR
#define BLOCK_PAIR 128
#undef BLOCK_BIO_PAIR
#define BLOCK_BIO_PAIR 128
#undef MAX_SHARED_TYPES
#define MAX_SHARED_TYPES 8
#undef SHUFFLE_AVAIL
#define SHUFFLE_AVAIL 0

#endif

#endif

// -------------------------------------------------------------------------
// KERNEL MACROS
// -------------------------------------------------------------------------
@@ -111,12 +68,6 @@

#define fast_mul(X,Y) (X)*(Y)

#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ < 200)
#define fast_mul __mul24
#endif
#endif

#define EVFLAG 1
#define NOUNROLL
#define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
@@ -220,14 +171,6 @@
// KERNEL MACROS - MATH
// -------------------------------------------------------------------------

#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4
{
  double x, y, z, w;
};
typedef struct _double4 double4;
#endif

#ifdef _DOUBLE_DOUBLE

#define ucl_exp exp