Changes from Mike Brown.
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5277 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
94
lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp
Normal file
94
lib/gpu/cudpp_mini/cudpp_maximal_launch.cpp
Normal file
@ -0,0 +1,94 @@
|
||||
// -------------------------------------------------------------
|
||||
// cuDPP -- CUDA Data Parallel Primitives library
|
||||
// -------------------------------------------------------------
|
||||
// $Revision$
|
||||
// $Date$
|
||||
// -------------------------------------------------------------
|
||||
// This source code is distributed under the terms of license.txt
|
||||
// in the root directory of this source distribution.
|
||||
// -------------------------------------------------------------
|
||||
#include "cudpp_maximal_launch.h"
|
||||
|
||||
inline size_t min(size_t x, size_t y)
|
||||
{
|
||||
return (x <= y) ? x : y;
|
||||
}
|
||||
|
||||
inline size_t max(size_t x, size_t y)
|
||||
{
|
||||
return (x >= y) ? x : y;
|
||||
}
|
||||
|
||||
// computes next highest multiple of f from x
|
||||
inline size_t multiple(size_t x, size_t f)
|
||||
{
|
||||
return ((x + (f-1)) / f);
|
||||
}
|
||||
|
||||
|
||||
// MS Excel-style CEIL() function
|
||||
// Rounds x up to nearest multiple of f
|
||||
inline size_t ceiling(size_t x, size_t f)
|
||||
{
|
||||
return multiple(x, f) * f;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
size_t maxBlocks(cudaFuncAttributes &attribs,
|
||||
cudaDeviceProp &devprop,
|
||||
size_t bytesDynamicSharedMem,
|
||||
size_t threadsPerBlock)
|
||||
{
|
||||
|
||||
// Determine the maximum number of CTAs that can be run simultaneously for each kernel
|
||||
// This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
|
||||
const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
|
||||
const unsigned int warpAllocationMultiple = 2;
|
||||
const unsigned int smemAllocationUnit = 512; // in bytes
|
||||
const unsigned int maxThreadsPerSM = (devprop.major < 2 && devprop.minor < 2) ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
|
||||
const unsigned int maxBlocksPerSM = 8;
|
||||
|
||||
// Number of warps (round up to nearest whole multiple of warp size)
|
||||
size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
|
||||
// Round up to warp allocation multiple
|
||||
numWarps = ceiling(numWarps, warpAllocationMultiple);
|
||||
|
||||
// Number of regs is regs per thread times number of warps times warp size
|
||||
size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
|
||||
// Round up to multiple of register allocation unit size
|
||||
regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);
|
||||
|
||||
size_t smemBytes = attribs.sharedSizeBytes + bytesDynamicSharedMem;
|
||||
size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);
|
||||
|
||||
size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
|
||||
size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
|
||||
size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;
|
||||
|
||||
return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
size_t maxBlocksFromPointer(void* kernel,
|
||||
size_t bytesDynamicSharedMem,
|
||||
size_t threadsPerBlock)
|
||||
{
|
||||
cudaDeviceProp devprop;
|
||||
int deviceID = -1;
|
||||
cudaError_t err = cudaGetDevice(&deviceID);
|
||||
if (err == cudaSuccess)
|
||||
{
|
||||
err = cudaGetDeviceProperties(&devprop, deviceID);
|
||||
if (err != cudaSuccess)
|
||||
return -1;
|
||||
|
||||
cudaFuncAttributes attr;
|
||||
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
|
||||
if (err != cudaSuccess)
|
||||
return -1;
|
||||
|
||||
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
Reference in New Issue
Block a user