/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

#pragma once

#ifndef __CUDA_ARCH__
    #include <set>              // NVCC (EDG, really) takes FOREVER to compile std::map
    #include <map>
#endif

#include <math.h>

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include "host/spinlock.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilModule
 * @{
 */


/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/
/**
 * \brief A simple caching allocator for device memory allocations.
 *
 * \par Overview
 * The allocator is thread-safe and is capable of managing cached device allocations
 * on multiple devices.  It behaves as follows:
 *
 * \par
 * - Allocations are categorized by bin size.
 * - Bin sizes progress geometrically in accordance with the growth factor
 *   \p bin_growth provided during construction.  Unused device allocations within
 *   a larger bin cache are not reused for allocation requests that categorize to
 *   smaller bin sizes.
 * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
 *   (\p bin_growth ^ \p min_bin).
 * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
 *   bin and are simply freed when they are deallocated instead of being returned
 *   to a bin-cache.
 * - If the total storage of cached allocations on a given device will exceed
 *   \p max_cached_bytes, allocations for that device are simply freed when they are
 *   deallocated instead of being returned to their bin-cache.
 *
 * \par
 * For example, the default-constructed CachingDeviceAllocator is configured with:
 * - \p bin_growth = 8
 * - \p min_bin = 3
 * - \p max_bin = 7
 * - \p max_cached_bytes = 6MB - 1B
 *
 * \par
 * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB,
 * and sets a maximum of 6,291,455 cached bytes per device.
 *
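 * \par Usage
 * A minimal illustrative sketch (the instance name, buffer name, and request
 * size below are hypothetical, not taken from this header):
 * \code
 * CachingDeviceAllocator allocator;               // default bins: 512B .. 2MB
 * void *d_scratch = NULL;
 * allocator.DeviceAllocate(&d_scratch, 1000);     // 1000-byte request rounds up to the 4KB bin
 * // ... launch kernels that use d_scratch ...
 * allocator.DeviceFree(d_scratch);                // block is cached for reuse (if under max_cached_bytes)
 * \endcode
 *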
 */
struct CachingDeviceAllocator
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document


    //---------------------------------------------------------------------
    // Type definitions and constants
    //---------------------------------------------------------------------

    enum
    {
        /// Invalid device ordinal
        INVALID_DEVICE_ORDINAL = -1,
    };

    /**
     * Integer pow function for unsigned base and exponent
     */
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;     // multiply the result by the current base
            }
            base = base * base;             // square the base
            exp = exp >> 1;                 // divide the exponent in half
        }
        return retval;
    }
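
    // Worked illustration (values are hypothetical, not from the original source):
    // IntPow(8, 5) == 32768, i.e. the 32KB bin size produced by the default
    // bin_growth of 8.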

    /**
     * Round up to the nearest power of the given base (returns the exponent
     * and the rounded byte count)
     */
    static void NearestPowerOf(
        unsigned int    &power,
        size_t          &rounded_bytes,
        unsigned int    base,
        size_t          value)
    {
        power = 0;
        rounded_bytes = 1;

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }
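
    // Worked illustration (hypothetical values, assuming bin_growth == 8):
    // NearestPowerOf(power, rounded_bytes, 8, 1000) yields power == 4 and
    // rounded_bytes == 4096, i.e. a 1000-byte request categorizes to the 4KB bin.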

    /**
     * Descriptor for device memory allocations
     */
    struct BlockDescriptor
    {
        int             device;    // Device ordinal
        void*           d_ptr;     // Device pointer
        size_t          bytes;     // Size of allocation in bytes
        unsigned int    bin;       // Bin enumeration

        // Constructor
        BlockDescriptor(void *d_ptr, int device) :
            d_ptr(d_ptr),
            bytes(0),
            bin(0),
            device(device) {}

        // Constructor
        BlockDescriptor(size_t bytes, unsigned int bin, int device) :
            d_ptr(NULL),
            bytes(bytes),
            bin(bin),
            device(device) {}

        // Comparison functor for comparing device pointers
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device < b.device) {
                return true;
            } else if (a.device > b.device) {
                return false;
            } else {
                return (a.d_ptr < b.d_ptr);
            }
        }

        // Comparison functor for comparing allocation sizes
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device < b.device) {
                return true;
            } else if (a.device > b.device) {
                return false;
            } else {
                return (a.bytes < b.bytes);
            }
        }
    };

    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

#ifndef __CUDA_ARCH__   // Only define STL container members in host code

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of bytes cached by each device
    typedef std::map<int, size_t> GpuCachedBytes;

#endif // __CUDA_ARCH__

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    Spinlock        spin_lock;          /// Spinlock for thread-safety

    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
    unsigned int    min_bin;            /// Minimum bin enumeration
    unsigned int    max_bin;            /// Maximum bin enumeration

    size_t          min_bin_bytes;      /// Minimum bin size
    size_t          max_bin_bytes;      /// Maximum bin size
    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device

    bool            debug;              /// Whether or not to print (de)allocation events to stdout
    bool            skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when the destructor is called. (The CUDA runtime may have already shut down for statically declared allocators)

#ifndef __CUDA_ARCH__   // Only define STL container members in host code

    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use

#endif // __CUDA_ARCH__

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * \brief Constructor.
     */
    CachingDeviceAllocator(
        unsigned int bin_growth,        ///< Geometric growth factor for bin-sizes
        unsigned int min_bin,           ///< Minimum bin
        unsigned int max_bin,           ///< Maximum bin
        size_t max_cached_bytes)        ///< Maximum aggregate cached bytes per device
    :
    #ifndef __CUDA_ARCH__   // Only define STL container members in host code
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare),
    #endif
        skip_cleanup(false),            // do not skip destructor cleanup by default
        debug(false),
        spin_lock(0),
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes)
    {}
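
    // Illustrative construction with custom bins (hypothetical values, not part
    // of the original header): power-of-two bins from 1KB (2^10) to 1MB (2^20),
    // with at most 4MB cached per device:
    //
    //     CachingDeviceAllocator allocator(2, 10, 20, (1 << 22));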


    /**
     * \brief Default constructor.
     *
     * Configured with:
     * \par
     * - \p bin_growth = 8
     * - \p min_bin = 3
     * - \p max_bin = 7
     * - \p max_cached_bytes = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
     *
     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
     * sets a maximum of 6,291,455 cached bytes per device
     */
    CachingDeviceAllocator(bool skip_cleanup = false) :
    #ifndef __CUDA_ARCH__   // Only define STL container members in host code
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare),
    #endif
        skip_cleanup(skip_cleanup),
        debug(false),
        spin_lock(0),
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1)
    {}


    /**
     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
    #ifdef __CUDA_ARCH__
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        // Lock
        Lock(&spin_lock);

        this->max_cached_bytes = max_cached_bytes;

        if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);

        // Unlock
        Unlock(&spin_lock);

        return cudaSuccess;

    #endif // __CUDA_ARCH__
    }
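
    // Illustrative call (the cap value and instance name are hypothetical):
    //
    //     allocator.SetMaxCachedBytes(512 * 1024 * 1024);   // allow up to 512MB cached per device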


    /**
     * \brief Provides a suitable allocation of device memory for the given size on the specified device
     */
    cudaError_t DeviceAllocate(
        void** d_ptr,
        size_t bytes,
        int device)
    {
    #ifdef __CUDA_ARCH__
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        bool locked             = false;
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        cudaError_t error       = cudaSuccess;

        // Round up to nearest bin size
        unsigned int bin;
        size_t bin_bytes;
        NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
        if (bin < min_bin) {
            bin = min_bin;
            bin_bytes = min_bin_bytes;
        }

        // Check if bin is greater than our maximum bin
        if (bin > max_bin)
        {
            // Allocate the request exactly and give out-of-range bin
            bin = (unsigned int) -1;
            bin_bytes = bytes;
        }

        BlockDescriptor search_key(bin_bytes, bin, device);

        // Lock
        if (!locked) {
            Lock(&spin_lock);
            locked = true;
        }

        do {
            // Find a free block big enough within the same bin on the same device
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
            if ((block_itr != cached_blocks.end()) &&
                (block_itr->device == device) &&
                (block_itr->bin == search_key.bin))
            {
                // Reuse existing cache block.  Insert into live blocks.
                search_key = *block_itr;
                live_blocks.insert(search_key);

                // Remove from free blocks
                cached_blocks.erase(block_itr);
                cached_bytes[device] -= search_key.bytes;

                if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
            }
            else
            {
                // Need to allocate a new cache block.  Unlock.
                if (locked) {
                    Unlock(&spin_lock);
                    locked = false;
                }

                // Set to specified device
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
                if (CubDebug(error = cudaSetDevice(device))) break;

                // Allocate
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;

                // Lock
                if (!locked) {
                    Lock(&spin_lock);
                    locked = true;
                }

                // Insert into live blocks
                live_blocks.insert(search_key);

                if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
            }
        } while(0);

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Copy device pointer to output parameter (NULL on error)
        *d_ptr = search_key.d_ptr;

        // Attempt to revert back to previous device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // __CUDA_ARCH__
    }
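
    // Illustrative call (instance name, device ordinal, and request size are hypothetical):
    //
    //     void *d_buf = NULL;
    //     allocator.DeviceAllocate(&d_buf, 64 * 1024, 1);   // 64KB request rounds up to the 256KB bin on device 1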


    /**
     * \brief Provides a suitable allocation of device memory for the given size on the current device
     */
    cudaError_t DeviceAllocate(
        void** d_ptr,
        size_t bytes)
    {
    #ifdef __CUDA_ARCH__
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else
        cudaError_t error = cudaSuccess;
        do {
            int current_device;
            if (CubDebug(error = cudaGetDevice(&current_device))) break;
            if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
        } while(0);

        return error;

    #endif // __CUDA_ARCH__
    }


    /**
     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator
     */
    cudaError_t DeviceFree(
        void* d_ptr,
        int device)
    {
    #ifdef __CUDA_ARCH__
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        bool locked             = false;
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        cudaError_t error       = cudaSuccess;

        BlockDescriptor search_key(d_ptr, device);

        // Lock
        if (!locked) {
            Lock(&spin_lock);
            locked = true;
        }

        do {
            // Find corresponding block descriptor
            BusyBlocks::iterator block_itr = live_blocks.find(search_key);
            if (block_itr == live_blocks.end())
            {
                // Cannot find pointer
                if (CubDebug(error = cudaErrorUnknown)) break;
            }
            else
            {
                // Remove from live blocks
                search_key = *block_itr;
                live_blocks.erase(block_itr);

                // Check if we should keep the returned allocation
                if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
                {
                    // Insert returned allocation into free blocks
                    cached_blocks.insert(search_key);
                    cached_bytes[device] += search_key.bytes;

                    if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
                }
                else
                {
                    // Free the returned allocation.  Unlock.
                    if (locked) {
                        Unlock(&spin_lock);
                        locked = false;
                    }

                    // Set to specified device
                    if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
                    if (CubDebug(error = cudaSetDevice(device))) break;

                    // Free device memory
                    if (CubDebug(error = cudaFree(d_ptr))) break;

                    if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
                }
            }
        } while (0);

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // __CUDA_ARCH__
    }


    /**
     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator
     */
    cudaError_t DeviceFree(
        void* d_ptr)
    {
    #ifdef __CUDA_ARCH__
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        int current_device;
        cudaError_t error = cudaSuccess;

        do {
            if (CubDebug(error = cudaGetDevice(&current_device))) break;
            if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
        } while(0);

        return error;

    #endif // __CUDA_ARCH__
    }


    /**
     * \brief Frees all cached device allocations on all devices
     */
    cudaError_t FreeAllCached()
    {
    #ifdef __CUDA_ARCH__
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        cudaError_t error       = cudaSuccess;
        bool locked             = false;
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        int current_device      = INVALID_DEVICE_ORDINAL;

        // Lock
        if (!locked) {
            Lock(&spin_lock);
            locked = true;
        }

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;

            // Reduce balance and erase entry (copy the byte count first, since
            // erase invalidates the iterator)
            size_t block_bytes = begin->bytes;
            cached_bytes[current_device] -= block_bytes;
            cached_blocks.erase(begin);

            if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
        }

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // __CUDA_ARCH__
    }
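
    // FreeAllCached() may also be invoked explicitly, e.g. to release all cached
    // device memory ahead of context teardown (illustrative call; the instance
    // name is hypothetical):
    //
    //     allocator.FreeAllCached();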


    /**
     * \brief Destructor
     */
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};


/** @} */       // end group UtilModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)