/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */
#pragma once

#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilModule
 * @{
 */

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }


/**
 * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        // Round each request up to the alignment granularity
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_sizes[i] = bytes_needed;     // replace the requested size with its byte offset
        bytes_needed += allocation_bytes;
    }

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorMemoryAllocation);
    }

    // Alias
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_sizes[i];
    }

    return cudaSuccess;
}
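
/*
 * Usage sketch (illustrative; the buffer sizes below are hypothetical).  The
 * intended pattern is two-pass: call once with a NULL d_temp_storage to obtain
 * the total aligned size, allocate once, then call again to carve out the
 * aliased sub-allocations.  Note that this routine overwrites allocation_sizes
 * with byte offsets, so the sizes must be re-initialized before the second call.
 *
 *     void*  allocations[2];
 *     size_t allocation_sizes[2] = {16384, 32768};
 *
 *     // Pass 1: query the total size needed
 *     void*  d_temp_storage      = NULL;
 *     size_t temp_storage_bytes  = 0;
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 *
 *     // One device allocation serves all requests
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 *     // Pass 2: allocations[i] now points at a 256-byte-aligned sub-buffer
 *     allocation_sizes[0] = 16384;
 *     allocation_sizes[1] = 32768;
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 */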

#endif // DOXYGEN_SHOULD_SKIP_THIS


/**
 * \brief Retrieves the PTX version (major * 100 + minor * 10)
 */
__host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // cudaFuncAttributes::ptxVersion is reported as major * 10 + minor
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}
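
/*
 * Usage sketch (illustrative): query the PTX version bundled for the current
 * device, e.g. 350 for PTX 3.5.
 *
 *     int ptx_version;
 *     cudaError_t error = PtxVersion(ptx_version);
 *     if (error == cudaSuccess)
 *         printf("Bundled PTX version: %d\n", ptx_version);
 */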

/**
 * Synchronize the specified stream.  (Device code cannot yet synchronize on a
 * specific stream, so from device code this synchronizes the entire device.)
 */
__host__ __device__ __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#ifndef __CUDA_ARCH__
    return cudaStreamSynchronize(stream);
#else
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}
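
/*
 * Usage sketch (illustrative; my_kernel and its launch configuration are
 * hypothetical):
 *
 *     my_kernel<<<grid_dims, block_dims, 0, stream>>>(d_in, d_out);
 *     cudaError_t error = SyncStream(stream);
 */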

/**
 * \brief Properties of a given CUDA device and the corresponding PTX bundle
 */
class Device
{
private:

    /// Type definition of the EmptyKernel kernel entry point
    typedef void (*EmptyKernelPtr)();

    /// Force EmptyKernel<void> to be generated if this class is used
    __host__ __device__ __forceinline__
    EmptyKernelPtr Empty()
    {
        return EmptyKernel<void>;
    }

public:

    // Version information
    int     sm_version;             ///< SM version of target device (SM version X.YZ in XYZ integer form)
    int     ptx_version;            ///< Bundled PTX version for target device (PTX version X.YZ in XYZ integer form)

    // Target device properties
    int     sm_count;               ///< Number of SMs
    int     warp_threads;           ///< Number of threads per warp
    int     smem_bank_bytes;        ///< Number of bytes per shared memory bank
    int     smem_banks;             ///< Number of smem banks
    int     smem_bytes;             ///< Smem bytes per SM
    int     smem_alloc_unit;        ///< Smem segment size
    bool    regs_by_block;          ///< Whether registers are allocated by threadblock (or by warp)
    int     reg_alloc_unit;         ///< Granularity of register allocation within the SM
    int     warp_alloc_unit;        ///< Granularity of warp allocation within the SM
    int     max_sm_threads;         ///< Maximum number of threads per SM
    int     max_sm_blocks;          ///< Maximum number of threadblocks per SM
    int     max_block_threads;      ///< Maximum number of threads per threadblock
    int     max_sm_registers;       ///< Maximum number of registers per SM
    int     max_sm_warps;           ///< Maximum number of warps per SM

    /**
     * Callback for initializing device properties
     */
    template <typename ArchProps>
    __host__ __device__ __forceinline__ void Callback()
    {
        warp_threads        = ArchProps::WARP_THREADS;
        smem_bank_bytes     = ArchProps::SMEM_BANK_BYTES;
        smem_banks          = ArchProps::SMEM_BANKS;
        smem_bytes          = ArchProps::SMEM_BYTES;
        smem_alloc_unit     = ArchProps::SMEM_ALLOC_UNIT;
        regs_by_block       = ArchProps::REGS_BY_BLOCK;
        reg_alloc_unit      = ArchProps::REG_ALLOC_UNIT;
        warp_alloc_unit     = ArchProps::WARP_ALLOC_UNIT;
        max_sm_threads      = ArchProps::MAX_SM_THREADS;
        max_sm_blocks       = ArchProps::MAX_SM_THREADBLOCKS;
        max_block_threads   = ArchProps::MAX_BLOCK_THREADS;
        max_sm_registers    = ArchProps::MAX_SM_REGISTERS;
        max_sm_warps        = max_sm_threads / warp_threads;
    }
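
    // Note: the Callback() member above is not called directly.  Init() invokes
    // ArchProps<100>::Callback(*this, sm_version) (see util_arch.cuh), which,
    // as wired up there, walks the chain of known architectures and calls back
    // into Callback<ArchProps<ARCH> >() for the entry matching sm_version.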

public:

    /**
     * Initializer.  Properties are retrieved for the specified GPU ordinal.
     */
    __host__ __device__ __forceinline__
    cudaError_t Init(int device_ordinal)
    {
    #ifndef CUB_RUNTIME_ENABLED

        // CUDA API calls not supported from this device
        return CubDebug(cudaErrorInvalidConfiguration);

    #else

        cudaError_t error = cudaSuccess;
        do
        {
            // Fill in SM version
            int major, minor;
            if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
            if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
            sm_version = major * 100 + minor * 10;

            // Fill in static SM properties
            // Initialize our device properties via callback from static device properties
            ArchProps<100>::Callback(*this, sm_version);

            // Fill in SM count
            if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Fill in PTX version
        #if CUB_PTX_ARCH > 0
            ptx_version = CUB_PTX_ARCH;
        #else
            if (CubDebug(error = PtxVersion(ptx_version))) break;
        #endif

        }
        while (0);

        return error;

    #endif
    }


    /**
     * Initializer.  Properties are retrieved for the current GPU ordinal.
     */
    __host__ __device__ __forceinline__
    cudaError_t Init()
    {
    #ifndef CUB_RUNTIME_ENABLED

        // CUDA API calls not supported from this device
        return CubDebug(cudaErrorInvalidConfiguration);

    #else

        cudaError_t error = cudaSuccess;
        do
        {
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
            if ((error = Init(device_ordinal))) break;
        }
        while (0);

        return error;

    #endif
    }
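
    /*
     * Usage sketch (illustrative): initialize for the current GPU, then read
     * the filled-in properties.
     *
     *     Device device_props;
     *     cudaError_t error = device_props.Init();
     *     if (error == cudaSuccess)
     *         printf("SM %d, %d SMs, %d threads/block max\n",
     *             device_props.sm_version,
     *             device_props.sm_count,
     *             device_props.max_block_threads);
     */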

    /**
     * Computes maximum SM occupancy in thread blocks for the given kernel
     */
    template <typename KernelPtr>
    __host__ __device__ __forceinline__
    cudaError_t MaxSmOccupancy(
        int         &max_sm_occupancy,  ///< [out] maximum number of thread blocks that can reside on a single SM
        KernelPtr   kernel_ptr,         ///< [in] Kernel pointer for which to compute SM occupancy
        int         block_threads)      ///< [in] Number of threads per thread block
    {
    #ifndef CUB_RUNTIME_ENABLED

        // CUDA API calls not supported from this device
        return CubDebug(cudaErrorInvalidConfiguration);

    #else

        cudaError_t error = cudaSuccess;
        do
        {
            // Get kernel attributes
            cudaFuncAttributes kernel_attrs;
            if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;

            // Number of warps per threadblock
            int block_warps = (block_threads + warp_threads - 1) / warp_threads;

            // Max warp occupancy
            int max_warp_occupancy = (block_warps > 0) ?
                max_sm_warps / block_warps :
                max_sm_blocks;

            // Maximum register occupancy
            int max_reg_occupancy;
            if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
            {
                // Prevent divide-by-zero
                max_reg_occupancy = max_sm_blocks;
            }
            else if (regs_by_block)
            {
                // Allocates registers by threadblock
                int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
                max_reg_occupancy = max_sm_registers / block_regs;
            }
            else
            {
                // Allocates registers by warp
                int sm_sides                = warp_alloc_unit;
                int sm_registers_per_side   = max_sm_registers / sm_sides;
                int regs_per_warp           = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
                int warps_per_side          = sm_registers_per_side / regs_per_warp;
                int warps                   = warps_per_side * sm_sides;
                max_reg_occupancy           = warps / block_warps;
            }

            // Shared memory per threadblock
            int block_allocated_smem = CUB_ROUND_UP_NEAREST(
                kernel_attrs.sharedSizeBytes,
                smem_alloc_unit);

            // Max shared memory occupancy
            int max_smem_occupancy = (block_allocated_smem > 0) ?
                (smem_bytes / block_allocated_smem) :
                max_sm_blocks;

            // Max occupancy is the least of the warp, register, and smem limits,
            // capped at the SM's threadblock limit
            max_sm_occupancy = CUB_MIN(
                CUB_MIN(max_sm_blocks, max_warp_occupancy),
                CUB_MIN(max_smem_occupancy, max_reg_occupancy));

//            printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d)", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);

        } while (0);

        return error;

    #endif
    }
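
    /*
     * Usage sketch (illustrative; SomeKernel is a hypothetical __global__
     * function instantiation):
     *
     *     int max_sm_occupancy;
     *     cudaError_t error = device_props.MaxSmOccupancy(
     *         max_sm_occupancy,       // resulting thread blocks per SM
     *         SomeKernel<int>,        // kernel pointer
     *         128);                   // threads per block
     */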

};


/** @} */       // end group UtilModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)