remove support for CUDA toolkits before version 8 and GPUs older than Kepler
@ -80,14 +80,14 @@ if(GPU_API STREQUAL "CUDA")
|
|||||||
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
|
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
|
||||||
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH}")
|
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH}")
|
||||||
|
|
||||||
# apply the following to build "fat" CUDA binaries only for known CUDA toolkits
|
# apply the following to build "fat" CUDA binaries only for known CUDA toolkits since version 8.0
|
||||||
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
|
# only the Kepler achitecture and beyond is supported
|
||||||
message(WARNING "Untested CUDA Toolkit version. Use at your own risk")
|
if(CUDA_VERSION VERSION_LESS 8.0)
|
||||||
|
message(FATAL_ERROR "CUDA Toolkit version 8.0 or later is required")
|
||||||
|
elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
|
||||||
|
message(WARNING "Untested CUDA Toolkit version ${CUDA_VERSION}. Use at your own risk")
|
||||||
|
set(GPU_CUDA_GENCODE "-arch=all")
|
||||||
else()
|
else()
|
||||||
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
|
|
||||||
if((CUDA_VERSION VERSION_GREATER_EQUAL "3.2") AND (CUDA_VERSION VERSION_LESS "9.0"))
|
|
||||||
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_20,code=[sm_20,compute_20] ")
|
|
||||||
endif()
|
|
||||||
# Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
|
# Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
|
||||||
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
|
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
|
||||||
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30] ")
|
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30] ")
|
||||||
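For illustration only (not part of the commit): the same version gating as a standalone C++ helper. The function name and the major*1000 + minor*10 version encoding (the convention of the CUDA_VERSION macro) are assumptions for this sketch.

  #include <stdexcept>
  #include <string>

  // Returns the nvcc arch flags the CMake logic above would produce.
  // version: CUDA toolkit version as major*1000 + minor*10, e.g. 11040 for 11.4
  std::string gpu_gencode(int version, const std::string &gpu_arch) {
    if (version < 8000)                       // hard error below CUDA 8.0
      throw std::runtime_error("CUDA Toolkit version 8.0 or later is required");
    if (version >= 12000)                     // untested toolkit: build for all archs
      return "-arch=all";
    std::string flags = "-arch=" + gpu_arch;  // preferred arch, runs without JIT
    if (version < 11000)                      // Kepler support dropped with CUDA 11
      flags += " -gencode arch=compute_30,code=[sm_30,compute_30]";
    return flags;
  }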
@ -120,6 +120,14 @@ if(GPU_API STREQUAL "CUDA")
|
|||||||
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1")
|
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1")
|
||||||
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_86,code=[sm_86,compute_86]")
|
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_86,code=[sm_86,compute_86]")
|
||||||
endif()
|
endif()
|
||||||
|
# Hopper (GPU Arch 9.0) is supported by CUDA 12.0? and later
|
||||||
|
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
|
||||||
|
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]")
|
||||||
|
endif()
|
||||||
|
# # Lovelace (GPU Arch 9.x) is supported by CUDA 12.0? and later
|
||||||
|
#if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
|
||||||
|
# string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_9x,code=[sm_9x,compute_9x]")
|
||||||
|
#endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC}
|
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC}
|
||||||
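Each -gencode pair embeds native code (sm_XX) plus PTX (compute_XX) for one architecture family. A hedged C++ sketch of the family names behind the numeric targets used above; note that Lovelace shipped as compute capability 8.9 (sm_89) rather than 9.x, which may be why that block is left commented out:

  // Compute capability -> architecture family, per the comments in this file.
  const char *arch_family(int major, int minor) {
    switch (major) {
      case 3: return "Kepler";
      case 5: return "Maxwell";
      case 6: return "Pascal";
      case 7: return (minor < 5) ? "Volta" : "Turing";
      case 8: return (minor < 9) ? "Ampere" : "Lovelace";  // sm_89 = Lovelace
      case 9: return "Hopper";
      default: return "unknown";
    }
  }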
@ -257,43 +265,47 @@ elseif(GPU_API STREQUAL "HIP")
|
|||||||
find_package(CUDA REQUIRED)
|
find_package(CUDA REQUIRED)
|
||||||
set(HIP_ARCH "sm_50" CACHE STRING "HIP primary CUDA architecture (e.g. sm_60)")
|
set(HIP_ARCH "sm_50" CACHE STRING "HIP primary CUDA architecture (e.g. sm_60)")
|
||||||
|
|
||||||
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
|
if(CUDA_VERSION VERSION_LESS 8.0)
|
||||||
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
|
message(FATAL_ERROR "CUDA Toolkit version 8.0 or later is required")
|
||||||
set(HIP_CUDA_GENCODE "-arch=${HIP_ARCH}")
|
elseif(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
|
||||||
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
|
message(WARNING "Untested CUDA Toolkit version ${CUDA_VERSION}. Use at your own risk")
|
||||||
if((CUDA_VERSION VERSION_GREATER_EQUAL "3.2") AND (CUDA_VERSION VERSION_LESS "9.0"))
|
set(HIP_CUDA_GENCODE "-arch=all")
|
||||||
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_20,code=[sm_20,compute_20]")
|
else()
|
||||||
endif()
|
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
|
||||||
# Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
|
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
|
||||||
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
|
set(HIP_CUDA_GENCODE "-arch=${HIP_ARCH}")
|
||||||
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30]")
|
# Kepler (GPU Arch 3.0) is supported by CUDA 5 to CUDA 10.2
|
||||||
endif()
|
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "11.0"))
|
||||||
# Kepler (GPU Arch 3.5) is supported by CUDA 5 to CUDA 11.0
|
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_30,code=[sm_30,compute_30]")
|
||||||
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "12.0"))
|
endif()
|
||||||
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_35,code=[sm_35,compute_35]")
|
# Kepler (GPU Arch 3.5) is supported by CUDA 5 to CUDA 11.0
|
||||||
endif()
|
if((CUDA_VERSION VERSION_GREATER_EQUAL "5.0") AND (CUDA_VERSION VERSION_LESS "12.0"))
|
||||||
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
|
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_35,code=[sm_35,compute_35]")
|
||||||
if(CUDA_VERSION VERSION_GREATER_EQUAL "6.0")
|
endif()
|
||||||
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]")
|
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
|
||||||
endif()
|
if(CUDA_VERSION VERSION_GREATER_EQUAL "6.0")
|
||||||
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
|
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52]")
|
||||||
if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0")
|
endif()
|
||||||
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61]")
|
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
|
||||||
endif()
|
if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0")
|
||||||
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
|
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61]")
|
||||||
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
|
endif()
|
||||||
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_70,code=[sm_70,compute_70]")
|
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
|
||||||
endif()
|
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
|
||||||
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
|
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_70,code=[sm_70,compute_70]")
|
||||||
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
|
endif()
|
||||||
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_75,code=[sm_75,compute_75]")
|
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
|
||||||
endif()
|
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
|
||||||
# Ampere (GPU Arch 8.0) is supported by CUDA 11 and later
|
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_75,code=[sm_75,compute_75]")
|
||||||
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
|
endif()
|
||||||
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_80,code=[sm_80,compute_80]")
|
# Ampere (GPU Arch 8.0) is supported by CUDA 11 and later
|
||||||
endif()
|
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
|
||||||
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
|
string(APPEND HIP_CUDA_GENCODE " -gencode arch=compute_80,code=[sm_80,compute_80]")
|
||||||
message(WARNING "Unsupported CUDA version. Use at your own risk.")
|
endif()
|
||||||
|
# Hopper (GPU Arch 9.0) is supported by CUDA 12.0? and later
|
||||||
|
if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
|
||||||
|
string(APPEND GPU_CUDA_GENCODE " -gencode arch=compute_90,code=[sm_90,compute_90]")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|||||||
@@ -141,8 +141,6 @@ CMake build
 
 :code:`GPU_ARCH` settings for different GPU hardware is as follows:
 
-* sm_12 or sm_13 for GT200 (supported by CUDA 3.2 until CUDA 6.5)
-* sm_20 or sm_21 for Fermi (supported by CUDA 3.2 until CUDA 7.5)
 * sm_30 for Kepler (supported since CUDA 5 and until CUDA 10.x)
 * sm_35 or sm_37 for Kepler (supported since CUDA 5 and until CUDA 11.x)
 * sm_50 or sm_52 for Maxwell (supported since CUDA 6)
@@ -150,6 +148,7 @@ CMake build
 * sm_70 for Volta (supported since CUDA 9)
 * sm_75 for Turing (supported since CUDA 10)
 * sm_80 for Ampere (supported since CUDA 11)
+.. * sm_90 for Hopper (supported since CUDA 12)
 
 A more detailed list can be found, for example,
 at `Wikipedia's CUDA article <https://en.wikipedia.org/wiki/CUDA#GPUs_supported>`_
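To pick a suitable value, one can query the compute capability of the installed card. A minimal sketch using the CUDA runtime API (not part of this patch):

  #include <cstdio>
  #include <cuda_runtime.h>

  int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
      std::fprintf(stderr, "no CUDA device found\n");
      return 1;
    }
    // e.g. prints "GPU_ARCH=sm_70" on a Volta card
    std::printf("GPU_ARCH=sm_%d%d\n", prop.major, prop.minor);
    return 0;
  }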
@@ -160,6 +159,10 @@ Thus the GPU_ARCH setting is merely an optimization, to have code for
 the preferred GPU architecture directly included rather than having to wait
 for the JIT compiler of the CUDA driver to translate it.
 
+Version 8.0 or later of the CUDA toolkit is required and a GPU architecture
+of Kepler or later, which must *also* be supported by the CUDA toolkit in use
+**and** the CUDA driver in use.
+
 When building with CMake, you **must NOT** build the GPU library in ``lib/gpu``
 using the traditional build procedure. CMake will detect files generated by that
 process and will terminate with an error and a suggestion for how to remove them.
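A hedged sketch of checking both requirements, the toolkit at compile time and the driver at run time, via the CUDA driver API (standalone illustration, not code from this commit):

  #include <cstdio>
  #include <cuda.h>

  #if CUDA_VERSION < 8000
  #error CUDA Toolkit version 8.0 or later is required
  #endif

  int main() {
    int drv = 0;
    if (cuDriverGetVersion(&drv) != CUDA_SUCCESS) {
      std::fprintf(stderr, "CUDA driver not available\n");
      return 1;
    }
    // the driver version is encoded as major*1000 + minor*10
    std::printf("CUDA driver %d.%d\n", drv / 1000, (drv % 1000) / 10);
    return 0;
  }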
@@ -13,14 +13,6 @@ endif
 
 NVCC = nvcc
 
-# obsolete hardware. not supported by current drivers anymore.
-#CUDA_ARCH = -arch=sm_13
-#CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
-
-# Fermi hardware
-#CUDA_ARCH = -arch=sm_20
-#CUDA_ARCH = -arch=sm_21
-
 # Kepler hardware
 #CUDA_ARCH = -arch=sm_30
 #CUDA_ARCH = -arch=sm_32
@@ -45,6 +37,9 @@ CUDA_ARCH = -arch=sm_50
 #CUDA_ARCH = -arch=sm_80
 #CUDA_ARCH = -arch=sm_86
 
+# Hopper hardware
+#CUDA_ARCH = -arch=sm_90
+
 CUDA_CODE = -gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] \
 	-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] \
 	-gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_75,code=[sm_75,compute_75] \
@@ -171,7 +171,12 @@ NOTE: when compiling with CMake, all of the considerations listed below
 are considered within the CMake configuration process, so no separate
 compilation of the gpu library is required. Also this will build in support
 for all compute architecture that are supported by the CUDA toolkit version
-used to build the gpu library.
+used to build the gpu library. A similar setup is possible using
+Makefile.linux_multi after adjusting the settings for the CUDA toolkit in use.
+
+Only CUDA toolkit version 8.0 and later and only GPU architecture 3.0
+(aka Kepler) and later are supported by this version of LAMMPS. If you want
+to use older hard- or software you have to use an older version of LAMMPS.
 
 If you do not want to use a fat binary, that supports multiple CUDA
 architectures, the CUDA_ARCH must be set to match the GPU architecture. This
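As a sketch of how the fat binary works: nvcc compiles device code once per -gencode target, and __CUDA_ARCH__ identifies the target of each pass (illustrative kernel, not from this patch):

  // Compiled once per -gencode target listed in CUDA_CODE above.
  __global__ void arch_probe(int *out) {
  #if __CUDA_ARCH__ >= 900
    *out = 900;            // Hopper path
  #elif __CUDA_ARCH__ >= 300
    *out = __CUDA_ARCH__;  // Kepler and newer, e.g. 700 on Volta
  #else
    *out = 0;              // pre-Kepler: no longer built by these Makefiles
  #endif
  }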
@@ -379,18 +379,9 @@ UCL_Device::UCL_Device() {
     prop.regsPerBlock = hip_prop.regsPerBlock;
     prop.clockRate = hip_prop.clockRate;
     prop.computeMode = hip_prop.computeMode;
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev));
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
 
-    //#if CUDA_VERSION >= 2020
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
     CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.integrated, hipDeviceAttributeIntegrated, dev));
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
-    //#endif
-    //#if CUDA_VERSION >= 3010
     prop.concurrentKernels = hip_prop.concurrentKernels;
-    //CU_SAFE_CALL_NS(hipDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
-    //#endif
 
     _properties.push_back(prop);
   }
@@ -447,13 +438,11 @@ void UCL_Device::clear() {
 
 // List all devices along with all properties
 void UCL_Device::print_all(std::ostream &out) {
-  //#if CUDA_VERSION >= 2020
   int driver_version;
   hipDriverGetVersion(&driver_version);
   out << "Driver Version: "
       << driver_version/1000 << "." << driver_version%100
      << std::endl;
-  //#endif
 
   if (num_devices() == 0)
     out << "There is no device supporting HIP\n";
@@ -470,12 +459,10 @@ void UCL_Device::print_all(std::ostream &out) {
       out << "No\n";
     out << "  Total amount of global memory: "
         << gigabytes(i) << " GB\n";
-    //#if CUDA_VERSION >= 2000
     out << "  Number of compute units/multiprocessors: "
         << _properties[i].multiProcessorCount << std::endl;
     out << "  Number of cores: "
         << cores(i) << std::endl;
-    //#endif
     out << "  Total amount of constant memory: "
         << _properties[i].totalConstantMemory << " bytes\n";
     out << "  Total amount of local/shared memory per block: "
@@ -494,58 +481,29 @@ void UCL_Device::print_all(std::ostream &out) {
         << _properties[i].maxGridSize[0] << " x "
         << _properties[i].maxGridSize[1] << " x "
         << _properties[i].maxGridSize[2] << std::endl;
-    //out << "  Maximum memory pitch: "
-    //    << max_pitch(i) << " bytes\n";
-    //out << "  Texture alignment: "
-    //    << _properties[i].textureAlign << " bytes\n";
     out << "  Clock rate: "
         << clock_rate(i) << " GHz\n";
-    //#if CUDA_VERSION >= 2020
-    //out << "  Run time limit on kernels: ";
-    //if (_properties[i].kernelExecTimeoutEnabled)
-    //  out << "Yes\n";
-    //else
-    //  out << "No\n";
     out << "  Integrated: ";
     if (_properties[i].integrated)
       out << "Yes\n";
     else
       out << "No\n";
-    //out << "  Support host page-locked memory mapping: ";
-    //if (_properties[i].canMapHostMemory)
-    //  out << "Yes\n";
-    //else
-    //  out << "No\n";
     out << "  Compute mode: ";
     if (_properties[i].computeMode == hipComputeModeDefault)
       out << "Default\n"; // multiple threads can use device
-    //#if CUDA_VERSION >= 8000
-    // else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
-    //#else
     else if (_properties[i].computeMode == hipComputeModeExclusive)
-    //#endif
       out << "Exclusive\n"; // only thread can use device
     else if (_properties[i].computeMode == hipComputeModeProhibited)
       out << "Prohibited\n"; // no thread can use device
-    //#if CUDART_VERSION >= 4000
     else if (_properties[i].computeMode == hipComputeModeExclusiveProcess)
       out << "Exclusive Process\n"; // multiple threads 1 process
-    //#endif
     else
       out << "Unknown\n";
-    //#endif
-    //#if CUDA_VERSION >= 3010
     out << "  Concurrent kernel execution: ";
     if (_properties[i].concurrentKernels)
       out << "Yes\n";
     else
       out << "No\n";
-    //out << "  Device has ECC support enabled: ";
-    //if (_properties[i].ECCEnabled)
-    //  out << "Yes\n";
-    //else
-    //  out << "No\n";
-    //#endif
   }
 }
 
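The driver-version report kept above, as a standalone sketch against the HIP runtime, using the same output format as print_all (hypothetical program, not part of the commit):

  #include <cstdio>
  #include <hip/hip_runtime.h>

  int main() {
    int v = 0;
    if (hipDriverGetVersion(&v) != hipSuccess) return 1;
    std::printf("Driver Version: %d.%d\n", v / 1000, v % 100);
    return 0;
  }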
@@ -5,11 +5,7 @@
 #include <cassert>
 #include <hip/hip_runtime.h>
 
-//#if CUDA_VERSION >= 3020
 #define CUDA_INT_TYPE size_t
-//#else
-//#define CUDA_INT_TYPE unsigned
-//#endif
 
 #ifdef MPI_GERYON
 #include "mpi.h"
@@ -71,9 +71,6 @@ class UCL_Texture {
 
   /// Make a texture reference available to kernel
   inline void allow(UCL_Kernel &) {
-    //#if CUDA_VERSION < 4000
-    //CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
-    //#endif
   }
 
  private:
@@ -320,6 +320,9 @@ class UCL_Device {
 
 // Grabs the properties for all devices
 UCL_Device::UCL_Device() {
+#if CUDA_VERSION < 8000
+#error CUDA Toolkit version 8 or later required
+#endif
   CU_SAFE_CALL_NS(cuInit(0));
   CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
   for (int i=0; i<_num_devices; ++i) {
@@ -358,16 +361,12 @@ UCL_Device::UCL_Device() {
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev));
 
-    #if CUDA_VERSION >= 2020
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev));
-    #endif
-    #if CUDA_VERSION >= 3010
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
     CU_SAFE_CALL_NS(cuDeviceGetAttribute(&prop.ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
-    #endif
 
     _properties.push_back(prop);
   }
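The attribute-query pattern used here, in self-contained form; all of these attributes exist in every toolkit the build now accepts (hypothetical program, not part of the commit):

  #include <cstdio>
  #include <cuda.h>

  int main() {
    CUdevice dev;
    int concurrent = 0, ecc = 0;
    if (cuInit(0) != CUDA_SUCCESS || cuDeviceGet(&dev, 0) != CUDA_SUCCESS)
      return 1;
    cuDeviceGetAttribute(&concurrent, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
    cuDeviceGetAttribute(&ecc, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
    std::printf("concurrent kernels: %d, ECC: %d\n", concurrent, ecc);
    return 0;
  }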
@@ -415,13 +414,10 @@ void UCL_Device::clear() {
 
 // List all devices along with all properties
 void UCL_Device::print_all(std::ostream &out) {
-  #if CUDA_VERSION >= 2020
   int driver_version;
   cuDriverGetVersion(&driver_version);
   out << "CUDA Driver Version: "
-      << driver_version/1000 << "." << driver_version%100
-      << std::endl;
-  #endif
+      << driver_version/1000 << "." << driver_version%100 << std::endl;
 
   if (num_devices() == 0)
     out << "There is no device supporting CUDA\n";
@@ -438,12 +434,10 @@ void UCL_Device::print_all(std::ostream &out) {
       out << "No\n";
     out << "  Total amount of global memory: "
         << gigabytes(i) << " GB\n";
-    #if CUDA_VERSION >= 2000
     out << "  Number of compute units/multiprocessors: "
         << _properties[i].multiProcessorCount << std::endl;
     out << "  Number of cores: "
         << cores(i) << std::endl;
-    #endif
     out << "  Total amount of constant memory: "
         << _properties[i].totalConstantMemory << " bytes\n";
     out << "  Total amount of local/shared memory per block: "
@@ -468,7 +462,6 @@ void UCL_Device::print_all(std::ostream &out) {
         << _properties[i].textureAlign << " bytes\n";
     out << "  Clock rate: "
         << clock_rate(i) << " GHz\n";
-    #if CUDA_VERSION >= 2020
     out << "  Run time limit on kernels: ";
     if (_properties[i].kernelExecTimeoutEnabled)
       out << "Yes\n";
@@ -487,22 +480,14 @@ void UCL_Device::print_all(std::ostream &out) {
     out << "  Compute mode: ";
     if (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT)
       out << "Default\n"; // multiple threads can use device
-    #if CUDA_VERSION >= 8000
     else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
-    #else
-    else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE)
-    #endif
       out << "Exclusive\n"; // only thread can use device
     else if (_properties[i].computeMode == CU_COMPUTEMODE_PROHIBITED)
       out << "Prohibited\n"; // no thread can use device
-    #if CUDART_VERSION >= 4000
     else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
       out << "Exclusive Process\n"; // multiple threads 1 process
-    #endif
     else
       out << "Unknown\n";
-    #endif
-    #if CUDA_VERSION >= 3010
     out << "  Concurrent kernel execution: ";
     if (_properties[i].concurrentKernels)
       out << "Yes\n";
@@ -513,7 +498,6 @@ void UCL_Device::print_all(std::ostream &out) {
       out << "Yes\n";
     else
       out << "No\n";
-    #endif
   }
 }
 
@@ -165,17 +165,11 @@ class UCL_Program {
 class UCL_Kernel {
  public:
   UCL_Kernel() : _dimensions(1), _num_args(0) {
-    #if CUDA_VERSION < 4000
-    _param_size=0;
-    #endif
     _num_blocks[0]=0;
   }
 
   UCL_Kernel(UCL_Program &program, const char *function) :
     _dimensions(1), _num_args(0) {
-    #if CUDA_VERSION < 4000
-    _param_size=0;
-    #endif
     _num_blocks[0]=0;
     set_function(program,function);
     _cq=program._cq;
@@ -211,11 +205,7 @@ class UCL_Kernel {
     if (index==_num_args)
       add_arg(arg);
     else if (index<_num_args)
-      #if CUDA_VERSION >= 4000
       _kernel_args[index]=arg;
-      #else
-      CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
-      #endif
     else
       assert(0==1); // Must add kernel parameters in sequential order
   }
@@ -242,15 +232,7 @@ class UCL_Kernel {
 
   /// Add a kernel argument.
   inline void add_arg(const CUdeviceptr* const arg) {
-    #if CUDA_VERSION >= 4000
     _kernel_args[_num_args]=(void *)arg;
-    #else
-    void* ptr = (void*)(size_t)(*arg);
-    _param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
-    CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
-    _offsets.push_back(_param_size);
-    _param_size+=sizeof(ptr);
-    #endif
     _num_args++;
     if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
   }
@@ -258,14 +240,7 @@ class UCL_Kernel {
   /// Add a kernel argument.
   template <class dtype>
   inline void add_arg(const dtype* const arg) {
-    #if CUDA_VERSION >= 4000
     _kernel_args[_num_args]=const_cast<dtype *>(arg);
-    #else
-    _param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
-    CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
-    _offsets.push_back(_param_size);
-    _param_size+=sizeof(dtype);
-    #endif
     _num_args++;
     if (_num_args>UCL_MAX_KERNEL_ARGS) assert(0==1);
   }
@@ -298,13 +273,9 @@ class UCL_Kernel {
     _num_blocks[0]=num_blocks;
     _num_blocks[1]=1;
     _num_blocks[2]=1;
-    #if CUDA_VERSION >= 4000
     _block_size[0]=block_size;
     _block_size[1]=1;
     _block_size[2]=1;
-    #else
-    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
-    #endif
   }
 
   /// Set the number of thread blocks and the number of threads in each block
@@ -323,13 +294,9 @@ class UCL_Kernel {
     _num_blocks[0]=num_blocks_x;
     _num_blocks[1]=num_blocks_y;
     _num_blocks[2]=1;
-    #if CUDA_VERSION >= 4000
     _block_size[0]=block_size_x;
     _block_size[1]=block_size_y;
     _block_size[2]=1;
-    #else
-    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
-    #endif
   }
 
   /// Set the number of thread blocks and the number of threads in each block
@@ -350,14 +317,9 @@ class UCL_Kernel {
     _num_blocks[0]=num_blocks_x;
     _num_blocks[1]=num_blocks_y;
     _num_blocks[2]=1;
-    #if CUDA_VERSION >= 4000
     _block_size[0]=block_size_x;
     _block_size[1]=block_size_y;
     _block_size[2]=block_size_z;
-    #else
-    CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
-                                     block_size_z));
-    #endif
   }
 
   /// Set the number of thread blocks and the number of threads in each block
@@ -373,23 +335,14 @@ class UCL_Kernel {
 
   /// Run the kernel in the default command queue
   inline void run() {
-    #if CUDA_VERSION >= 4000
     CU_SAFE_CALL(cuLaunchKernel(_kernel,_num_blocks[0],_num_blocks[1],
                                 _num_blocks[2],_block_size[0],_block_size[1],
                                 _block_size[2],0,_cq,_kernel_args,nullptr));
-    #else
-    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
-    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
-    #endif
   }
 
   /// Clear any arguments associated with the kernel
   inline void clear_args() {
     _num_args=0;
-    #if CUDA_VERSION < 4000
-    _offsets.clear();
-    _param_size=0;
-    #endif
   }
 
   /// Return the default command queue/stream associated with this data
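For reference, the surviving launch path in self-contained form; a sketch assuming an already-loaded CUfunction (the helper name launch_1d is illustrative, not from this header):

  #include <cuda.h>

  // Launch f(dst, n) on a 1-D grid, mirroring UCL_Kernel::run() above:
  // arguments travel as an array of pointers, with no cuParamSet* bookkeeping.
  CUresult launch_1d(CUfunction f, CUdeviceptr dst, int n,
                     unsigned blocks, unsigned threads, CUstream cq) {
    void *args[] = { &dst, &n };
    return cuLaunchKernel(f, blocks, 1, 1, threads, 1, 1,
                          0 /* shared mem bytes */, cq, args, nullptr);
  }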
@@ -406,13 +359,8 @@ class UCL_Kernel {
   unsigned _num_args;
   friend class UCL_Texture;
 
-  #if CUDA_VERSION >= 4000
   unsigned _block_size[3];
   void * _kernel_args[UCL_MAX_KERNEL_ARGS];
-  #else
-  std::vector<unsigned> _offsets;
-  unsigned _param_size;
-  #endif
 };
 
 } // namespace
@@ -5,11 +5,7 @@
 #include <cassert>
 #include <cuda.h>
 
-#if CUDA_VERSION >= 3020
 #define CUDA_INT_TYPE size_t
-#else
-#define CUDA_INT_TYPE unsigned
-#endif
 
 #ifdef MPI_GERYON
 #include "mpi.h"
@@ -69,9 +69,6 @@ class UCL_Texture {
 
   /// Make a texture reference available to kernel
   inline void allow(UCL_Kernel &kernel) {
-    #if CUDA_VERSION < 4000
-    CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
-    #endif
   }
 
  private:
@@ -25,21 +25,8 @@
 #ifndef UCL_NV_KERNEL_H
 #define UCL_NV_KERNEL_H
 
-#if (__CUDA_ARCH__ < 200)
-#define mul24 __mul24
-#define MEM_THREADS 16
-#else
 #define mul24(X,Y) (X)*(Y)
 #define MEM_THREADS 32
-#endif
-
-#ifdef CUDA_PRE_THREE
-struct __builtin_align__(16) _double4
-{
-  double x, y, z, w;
-};
-typedef struct _double4 double4;
-#endif
 
 #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
 #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
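With the pre-Fermi branch gone, mul24 is a plain multiply: on compute capability 2.0 and later, 32-bit integer multiplies run at full rate, so the 24-bit __mul24 intrinsic no longer pays off. A sketch of the macro in use (illustrative kernel, not from this header):

  #define mul24(X,Y) (X)*(Y)
  #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)

  __global__ void fill_ones(int *out, int n) {
    int i = GLOBAL_ID_X;   // expands to an ordinary 32-bit multiply-add
    if (i < n) out[i] = 1;
  }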
@@ -58,49 +58,6 @@
 #define MAX_BIO_SHARED_TYPES 128
 #define PPPM_MAX_SPLINE 8
 
-// -------------------------------------------------------------------------
-//                   LEGACY DEVICE CONFIGURATION
-// -------------------------------------------------------------------------
-
-#ifdef __CUDA_ARCH__
-
-#if (__CUDA_ARCH__ < 200)
-
-#undef CONFIG_ID
-#define CONFIG_ID 101
-#define MEM_THREADS 16
-#undef THREADS_PER_ATOM
-#define THREADS_PER_ATOM 1
-#undef THREADS_PER_CHARGE
-#define THREADS_PER_CHARGE 16
-#undef BLOCK_PAIR
-#define BLOCK_PAIR 64
-#undef BLOCK_BIO_PAIR
-#define BLOCK_BIO_PAIR 64
-#undef BLOCK_NBOR_BUILD
-#define BLOCK_NBOR_BUILD 64
-#undef MAX_SHARED_TYPES
-#define MAX_SHARED_TYPES 8
-#undef SHUFFLE_AVAIL
-#define SHUFFLE_AVAIL 0
-
-#elseif (__CUDA_ARCH__ < 300)
-
-#undef CONFIG_ID
-#define CONFIG_ID 102
-#undef BLOCK_PAIR
-#define BLOCK_PAIR 128
-#undef BLOCK_BIO_PAIR
-#define BLOCK_BIO_PAIR 128
-#undef MAX_SHARED_TYPES
-#define MAX_SHARED_TYPES 8
-#undef SHUFFLE_AVAIL
-#define SHUFFLE_AVAIL 0
-
-#endif
-
-#endif
-
 // -------------------------------------------------------------------------
 //                                KERNEL MACROS
 // -------------------------------------------------------------------------
@@ -111,12 +68,6 @@
 
 #define fast_mul(X,Y) (X)*(Y)
 
-#ifdef __CUDA_ARCH__
-#if (__CUDA_ARCH__ < 200)
-#define fast_mul __mul24
-#endif
-#endif
-
 #define EVFLAG 1
 #define NOUNROLL
 #define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
@@ -220,14 +171,6 @@
 //                   KERNEL MACROS - MATH
 // -------------------------------------------------------------------------
 
-#ifdef CUDA_PRE_THREE
-struct __builtin_align__(16) _double4
-{
-  double x, y, z, w;
-};
-typedef struct _double4 double4;
-#endif
-
 #ifdef _DOUBLE_DOUBLE
 
 #define ucl_exp exp