/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */
#pragma once

#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilModule
 * @{
 */

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }


/**
 * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        // Round each request up to the alignment granularity
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_sizes[i] = bytes_needed;     // replace the requested size with its byte offset
        bytes_needed += allocation_bytes;
    }

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorMemoryAllocation);
    }

    // Alias
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_sizes[i];
    }

    return cudaSuccess;
}
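
/*
 * Usage sketch (illustrative; the buffer sizes below are hypothetical).  The
 * intended pattern is two-pass: call once with a NULL d_temp_storage to obtain
 * the total aligned size, allocate once, then call again to carve out the
 * aliased sub-allocations.  Note that this routine overwrites allocation_sizes
 * with byte offsets, so the sizes must be re-initialized before the second call.
 *
 *     void*  allocations[2];
 *     size_t allocation_sizes[2] = {16384, 32768};
 *
 *     // Pass 1: query the total size needed
 *     void*  d_temp_storage      = NULL;
 *     size_t temp_storage_bytes  = 0;
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 *
 *     // One device allocation serves all requests
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 *     // Pass 2: allocations[i] now points at a 256-byte-aligned sub-buffer
 *     allocation_sizes[0] = 16384;
 *     allocation_sizes[1] = 32768;
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 */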

#endif // DOXYGEN_SHOULD_SKIP_THIS


/**
 * \brief Retrieves the PTX version (major * 100 + minor * 10)
 */
__host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // cudaFuncAttributes::ptxVersion is reported as major * 10 + minor
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}
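
/*
 * Usage sketch (illustrative): query the PTX version bundled for the current
 * device, e.g. 350 for PTX 3.5.
 *
 *     int ptx_version;
 *     cudaError_t error = PtxVersion(ptx_version);
 *     if (error == cudaSuccess)
 *         printf("Bundled PTX version: %d\n", ptx_version);
 */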

/**
 * Synchronize the specified stream.  (Device code cannot yet synchronize on a
 * specific stream, so from device code this synchronizes the entire device.)
 */
__host__ __device__ __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#ifndef __CUDA_ARCH__
    return cudaStreamSynchronize(stream);
#else
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}
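
/*
 * Usage sketch (illustrative; my_kernel and its launch configuration are
 * hypothetical):
 *
 *     my_kernel<<<grid_dims, block_dims, 0, stream>>>(d_in, d_out);
 *     cudaError_t error = SyncStream(stream);
 */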

/**
 * \brief Properties of a given CUDA device and the corresponding PTX bundle
 */
class Device
{
private:

    /// Type definition of the EmptyKernel kernel entry point
    typedef void (*EmptyKernelPtr)();

    /// Force EmptyKernel<void> to be generated if this class is used
    __host__ __device__ __forceinline__
    EmptyKernelPtr Empty()
    {
        return EmptyKernel<void>;
    }

public:

    // Version information
    int     sm_version;             ///< SM version of target device (SM version X.YZ in XYZ integer form)
    int     ptx_version;            ///< Bundled PTX version for target device (PTX version X.YZ in XYZ integer form)

    // Target device properties
    int     sm_count;               ///< Number of SMs
    int     warp_threads;           ///< Number of threads per warp
    int     smem_bank_bytes;        ///< Number of bytes per shared memory bank
    int     smem_banks;             ///< Number of smem banks
    int     smem_bytes;             ///< Smem bytes per SM
    int     smem_alloc_unit;        ///< Smem segment size
    bool    regs_by_block;          ///< Whether registers are allocated by threadblock (or by warp)
    int     reg_alloc_unit;         ///< Granularity of register allocation within the SM
    int     warp_alloc_unit;        ///< Granularity of warp allocation within the SM
    int     max_sm_threads;         ///< Maximum number of threads per SM
    int     max_sm_blocks;          ///< Maximum number of threadblocks per SM
    int     max_block_threads;      ///< Maximum number of threads per threadblock
    int     max_sm_registers;       ///< Maximum number of registers per SM
    int     max_sm_warps;           ///< Maximum number of warps per SM

    /**
     * Callback for initializing device properties
     */
    template <typename ArchProps>
    __host__ __device__ __forceinline__ void Callback()
    {
        warp_threads        = ArchProps::WARP_THREADS;
        smem_bank_bytes     = ArchProps::SMEM_BANK_BYTES;
        smem_banks          = ArchProps::SMEM_BANKS;
        smem_bytes          = ArchProps::SMEM_BYTES;
        smem_alloc_unit     = ArchProps::SMEM_ALLOC_UNIT;
        regs_by_block       = ArchProps::REGS_BY_BLOCK;
        reg_alloc_unit      = ArchProps::REG_ALLOC_UNIT;
        warp_alloc_unit     = ArchProps::WARP_ALLOC_UNIT;
        max_sm_threads      = ArchProps::MAX_SM_THREADS;
        max_sm_blocks       = ArchProps::MAX_SM_THREADBLOCKS;
        max_block_threads   = ArchProps::MAX_BLOCK_THREADS;
        max_sm_registers    = ArchProps::MAX_SM_REGISTERS;
        max_sm_warps        = max_sm_threads / warp_threads;
    }
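
    // Note: the Callback() member above is not called directly.  Init() invokes
    // ArchProps<100>::Callback(*this, sm_version) (see util_arch.cuh), which,
    // as wired up there, walks the chain of known architectures and calls back
    // into Callback<ArchProps<ARCH> >() for the entry matching sm_version.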

public:

    /**
     * Initializer.  Properties are retrieved for the specified GPU ordinal.
     */
    __host__ __device__ __forceinline__
    cudaError_t Init(int device_ordinal)
    {
    #ifndef CUB_RUNTIME_ENABLED

        // CUDA API calls not supported from this device
        return CubDebug(cudaErrorInvalidConfiguration);

    #else

        cudaError_t error = cudaSuccess;
        do
        {
            // Fill in SM version
            int major, minor;
            if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
            if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
            sm_version = major * 100 + minor * 10;

            // Fill in static SM properties
            // Initialize our device properties via callback from static device properties
            ArchProps<100>::Callback(*this, sm_version);

            // Fill in SM count
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Fill in PTX version
        #if CUB_PTX_ARCH > 0
            ptx_version = CUB_PTX_ARCH;
        #else
            if (CubDebug(error = PtxVersion(ptx_version))) break;
        #endif

        }
        while (0);

        return error;

    #endif
    }


    /**
     * Initializer.  Properties are retrieved for the current GPU ordinal.
     */
    __host__ __device__ __forceinline__
    cudaError_t Init()
    {
    #ifndef CUB_RUNTIME_ENABLED

        // CUDA API calls not supported from this device
        return CubDebug(cudaErrorInvalidConfiguration);

    #else

        cudaError_t error = cudaSuccess;
        do
        {
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
            if ((error = Init(device_ordinal))) break;
        }
        while (0);

        return error;

    #endif
    }
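
    /*
     * Usage sketch (illustrative): initialize for the current GPU, then read
     * the filled-in properties.
     *
     *     Device device_props;
     *     cudaError_t error = device_props.Init();
     *     if (error == cudaSuccess)
     *         printf("SM %d, %d SMs, %d threads/block max\n",
     *             device_props.sm_version,
     *             device_props.sm_count,
     *             device_props.max_block_threads);
     */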

    /**
     * Computes maximum SM occupancy in thread blocks for the given kernel
     */
    template <typename KernelPtr>
    __host__ __device__ __forceinline__
    cudaError_t MaxSmOccupancy(
        int         &max_sm_occupancy,  ///< [out] maximum number of thread blocks that can reside on a single SM
        KernelPtr   kernel_ptr,         ///< [in] Kernel pointer for which to compute SM occupancy
        int         block_threads)      ///< [in] Number of threads per thread block
    {
    #ifndef CUB_RUNTIME_ENABLED

        // CUDA API calls not supported from this device
        return CubDebug(cudaErrorInvalidConfiguration);

    #else

        cudaError_t error = cudaSuccess;
        do
        {
            // Get kernel attributes
            cudaFuncAttributes kernel_attrs;
            if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;

            // Number of warps per threadblock
            int block_warps = (block_threads + warp_threads - 1) / warp_threads;

            // Max warp occupancy
            int max_warp_occupancy = (block_warps > 0) ?
                max_sm_warps / block_warps :
                max_sm_blocks;

            // Maximum register occupancy
            int max_reg_occupancy;
            if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
            {
                // Prevent divide-by-zero
                max_reg_occupancy = max_sm_blocks;
            }
            else if (regs_by_block)
            {
                // Allocates registers by threadblock
                int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
                max_reg_occupancy = max_sm_registers / block_regs;
            }
            else
            {
                // Allocates registers by warp
                int sm_sides                = warp_alloc_unit;
                int sm_registers_per_side   = max_sm_registers / sm_sides;
                int regs_per_warp           = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
                int warps_per_side          = sm_registers_per_side / regs_per_warp;
                int warps                   = warps_per_side * sm_sides;
                max_reg_occupancy           = warps / block_warps;
            }

            // Shared memory per threadblock
            int block_allocated_smem = CUB_ROUND_UP_NEAREST(
                kernel_attrs.sharedSizeBytes,
                smem_alloc_unit);

            // Max shared memory occupancy
            int max_smem_occupancy = (block_allocated_smem > 0) ?
                (smem_bytes / block_allocated_smem) :
                max_sm_blocks;

            // Max occupancy is the least of the warp, register, and smem limits,
            // capped at the SM's threadblock limit
            max_sm_occupancy = CUB_MIN(
                CUB_MIN(max_sm_blocks, max_warp_occupancy),
                CUB_MIN(max_smem_occupancy, max_reg_occupancy));

//            printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d)", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);

        } while (0);

        return error;

    #endif
    }
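
    /*
     * Usage sketch (illustrative; SomeKernel is a hypothetical __global__
     * function instantiation):
     *
     *     int max_sm_occupancy;
     *     cudaError_t error = device_props.MaxSmOccupancy(
     *         max_sm_occupancy,       // resulting thread blocks per SM
     *         SomeKernel<int>,        // kernel pointer
     *         128);                   // threads per block
     */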

};


/** @} */       // end group UtilModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)