/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/runtime/cudaMemPool.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/ipcNvlsMemory.h"
#include "tensorrt_llm/runtime/memoryCounters.h"
#include "tensorrt_llm/runtime/virtualMemory.h"

#include <NvInferRuntime.h>
#include <cuda_runtime_api.h>

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <list>
#include <memory>
#include <mutex>
#include <new>
#include <numeric>
#include <set>
#include <tuple>
#include <type_traits>
#include <vector>

namespace tensorrt_llm::runtime
{

// CRTP base class
template <typename TDerived, MemoryType memoryType, bool count = true>
class BaseAllocator
{
public:
    using ValueType = void;
    using PointerType = ValueType*;
    static auto constexpr kMemoryType = memoryType;

    PointerType allocate(std::size_t n)
    {
        PointerType ptr{};
        static_cast<TDerived*>(this)->allocateImpl(&ptr, n);
        if constexpr (count)
        {
            MemoryCounters::getInstance().allocate<memoryType>(n);
        }
        return ptr;
    }

    void deallocate(PointerType ptr, std::size_t n)
    {
        if (ptr)
        {
            static_cast<TDerived*>(this)->deallocateImpl(ptr, n);
            if constexpr (count)
            {
                MemoryCounters::getInstance().deallocate<memoryType>(n);
            }
        }
    }

    [[nodiscard]] MemoryType constexpr getMemoryType() const
    {
        return memoryType;
    }
};
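
// Illustrative sketch (not part of this header, class name hypothetical): a concrete allocator only
// has to provide allocateImpl/deallocateImpl; BaseAllocator wraps them with the MemoryCounters
// bookkeeping and the null-pointer check on deallocation.
//
//   class MyHeapAllocator : public BaseAllocator<MyHeapAllocator, MemoryType::kCPU>
//   {
//       friend class BaseAllocator<MyHeapAllocator, MemoryType::kCPU>;
//
//   protected:
//       void allocateImpl(PointerType* ptr, std::size_t n)
//       {
//           *ptr = std::malloc(n); // error handling omitted in this sketch
//       }
//
//       void deallocateImpl(PointerType ptr, [[maybe_unused]] std::size_t n)
//       {
//           std::free(ptr);
//       }
//   };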

class CudaAllocator : public BaseAllocator<CudaAllocator, MemoryType::kGPU>
{
    friend class BaseAllocator<CudaAllocator, MemoryType::kGPU>;

public:
    CudaAllocator() noexcept = default;

protected:
    void allocateImpl(PointerType* ptr, std::size_t n) // NOLINT(readability-convert-member-functions-to-static)
    {
        TLLM_CUDA_CHECK(::cudaMalloc(ptr, n));
    }

    void deallocateImpl( // NOLINT(readability-convert-member-functions-to-static)
        PointerType ptr, [[maybe_unused]] std::size_t n)
    {
        TLLM_CUDA_CHECK_FREE_RESOURCE(::cudaFree(ptr));
    }
};

class CudaAllocatorAsync : public BaseAllocator<CudaAllocatorAsync, MemoryType::kGPU>
{
    friend class BaseAllocator<CudaAllocatorAsync, MemoryType::kGPU>;

public:
    using CudaStreamPtr = std::shared_ptr<CudaStream>;
    using CudaPoolPtr = std::shared_ptr<CudaMemPool>;

    explicit CudaAllocatorAsync(CudaStreamPtr stream, CudaPoolPtr memPool)
        : mCudaStream(std::move(stream))
        , mMemPool(std::move(memPool))
    {
        TLLM_CHECK_WITH_INFO(static_cast<bool>(mCudaStream), "Undefined CUDA stream");
        TLLM_CHECK_WITH_INFO(static_cast<bool>(mMemPool), "Undefined CUDA mem pool");
    }

    [[nodiscard]] CudaStreamPtr getCudaStream() const
    {
        return mCudaStream;
    }

protected:
    void allocateImpl(PointerType* ptr, std::size_t n)
    {
        TLLM_CUDA_CHECK(::cudaMallocAsync(ptr, n, mMemPool->getPool(), mCudaStream->get()));
    }

    void deallocateImpl(PointerType ptr, [[maybe_unused]] std::size_t n)
    {
        TLLM_CUDA_CHECK_FREE_RESOURCE(::cudaFreeAsync(ptr, mCudaStream->get()));
    }

private:
    CudaStreamPtr mCudaStream;
    CudaPoolPtr mMemPool;
};
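
// Illustrative usage sketch (hypothetical stream and pool objects, not defined in this header): the
// async allocator allocates from the given memory pool and both allocates and frees on the given
// stream, so all operations are ordered with respect to that stream.
//
//   std::shared_ptr<CudaStream> stream = /* stream owned by the caller */;
//   std::shared_ptr<CudaMemPool> pool = /* memory pool for the current device */;
//   CudaAllocatorAsync allocator{stream, pool};
//   void* ptr = allocator.allocate(std::size_t{1} << 20); // 1 MiB, enqueued on `stream`
//   allocator.deallocate(ptr, std::size_t{1} << 20);      // freed asynchronously on `stream`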

class UVMAllocator : public BaseAllocator<UVMAllocator, MemoryType::kUVM>
{
    friend class BaseAllocator<UVMAllocator, MemoryType::kUVM>;

public:
    using Base = BaseAllocator<UVMAllocator, MemoryType::kUVM>;
    UVMAllocator() noexcept = default;

protected:
    void allocateImpl(PointerType* ptr, std::size_t n) // NOLINT(readability-convert-member-functions-to-static)
    {
        TLLM_CUDA_CHECK(::cudaMallocManaged(ptr, n));
        // TLLM_CUDA_CHECK(::cudaMemAdvise(ptr, n, cudaMemAdviseSetPreferredLocation, 0));
    }

    void deallocateImpl( // NOLINT(readability-convert-member-functions-to-static)
        PointerType ptr, [[maybe_unused]] std::size_t n)
    {
        TLLM_CUDA_CHECK_FREE_RESOURCE(::cudaFree(ptr));
    }
};

class PinnedAllocator : public BaseAllocator<PinnedAllocator, MemoryType::kPINNED>
{
    friend class BaseAllocator<PinnedAllocator, MemoryType::kPINNED>;

public:
    using Base = BaseAllocator<PinnedAllocator, MemoryType::kPINNED>;
    PinnedAllocator() noexcept = default;

protected:
    void allocateImpl(PointerType* ptr, std::size_t n) // NOLINT(readability-convert-member-functions-to-static)
    {
        TLLM_CUDA_CHECK(::cudaHostAlloc(ptr, n, cudaHostAllocDefault));
    }

    void deallocateImpl( // NOLINT(readability-convert-member-functions-to-static)
        PointerType ptr, [[maybe_unused]] std::size_t n)
    {
        TLLM_CUDA_CHECK_FREE_RESOURCE(::cudaFreeHost(ptr));
    }
};

class HostAllocator : public BaseAllocator<HostAllocator, MemoryType::kCPU>
{
    friend class BaseAllocator<HostAllocator, MemoryType::kCPU>;

public:
    HostAllocator() noexcept = default;

protected:
    void allocateImpl(PointerType* ptr, std::size_t n) // NOLINT(readability-convert-member-functions-to-static)
    {
        *ptr = std::malloc(n);
        if (*ptr == nullptr)
        {
            throw std::bad_alloc();
        }
    }

    void deallocateImpl( // NOLINT(readability-convert-member-functions-to-static)
        PointerType ptr, [[maybe_unused]] std::size_t n)
    {
        std::free(ptr);
    }
};

template <MemoryType memoryType>
class BorrowingAllocator : public BaseAllocator<BorrowingAllocator<memoryType>, memoryType, false>
{
    friend class BaseAllocator<BorrowingAllocator<memoryType>, memoryType, false>;

public:
    using Base = BaseAllocator<BorrowingAllocator<memoryType>, memoryType, false>;
    using PointerType = typename Base::PointerType;

    BorrowingAllocator(void* ptr, std::size_t capacity)
        : mPtr(ptr)
        , mCapacity(capacity)
    {
        TLLM_CHECK_WITH_INFO(capacity == std::size_t(0) || static_cast<bool>(mPtr), "Undefined pointer");
    }

protected:
    void allocateImpl(PointerType* ptr, std::size_t n) // NOLINT(readability-convert-member-functions-to-static)
    {
        if (n <= mCapacity)
        {
            *ptr = mPtr;
        }
        else
        {
            throw std::bad_alloc();
        }
    }

    void deallocateImpl( // NOLINT(readability-convert-member-functions-to-static)
        [[maybe_unused]] PointerType ptr, [[maybe_unused]] std::size_t n)
    {
    }

private:
    PointerType mPtr;
    std::size_t mCapacity;
};

using CpuBorrowingAllocator = BorrowingAllocator<MemoryType::kCPU>;
using GpuBorrowingAllocator = BorrowingAllocator<MemoryType::kGPU>;
using PinnedBorrowingAllocator = BorrowingAllocator<MemoryType::kPINNED>;
using ManagedBorrowingAllocator = BorrowingAllocator<MemoryType::kUVM>;
using PinnedPoolBorrowingAllocator = BorrowingAllocator<MemoryType::kPINNEDPOOL>;

// using UVMBorrowingAllocator = BorrowingAllocator<MemoryType::kUVM>;
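
// Illustrative sketch (hypothetical storage): a BorrowingAllocator hands out memory it does not
// own. Allocation succeeds only while the request fits into the borrowed capacity, and
// deallocation is a no-op, so the caller keeps ownership of the underlying storage.
//
//   std::vector<char> storage(1024);
//   CpuBorrowingAllocator allocator{storage.data(), storage.size()};
//   void* ptr = allocator.allocate(512); // points at storage.data()
//   allocator.deallocate(ptr, 512);      // no-op; `storage` remains valid and caller-owned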

/**
 * A memory manager that acts as a memory pool, reserving memory in configurable chunks from the
 * underlying allocator. It grows by allocating additional chunks and serves allocation requests
 * from free segments within those chunks.
 */
template <typename TAllocator>
class MemoryPool : public BaseAllocator<MemoryPool<TAllocator>, TAllocator::kMemoryType, false>
{
    friend class BaseAllocator<MemoryPool<TAllocator>, TAllocator::kMemoryType, false>;

public:
    using Base = BaseAllocator<MemoryPool<TAllocator>, TAllocator::kMemoryType, false>;
    using PointerType = typename Base::PointerType;

    using Allocator = TAllocator;
    static_assert(std::is_same_v<typename Allocator::PointerType, PointerType>);

    static std::size_t constexpr kInitialChunkSize{std::size_t{1} << 29}; // 512 MB
    static std::size_t constexpr kAlignment{256};

    explicit MemoryPool(std::size_t chunkSize = kInitialChunkSize, Allocator allocator = Allocator{})
        : mChunkSize(chunkSize)
        , mAllocator{allocator}
    {
    }

    ~MemoryPool()
    {
        std::lock_guard<std::mutex> lock(mLock);
        TLLM_LOG_DEBUG("MemoryPool: Deallocating %zu chunks", mAllocatedChunks.size());
        for (auto const& [ptr, size] : mAllocatedChunks)
        {
            TLLM_LOG_DEBUG("MemoryPool: Deallocating %zu B", size);
            try
            {
                mAllocator.deallocate(ptr, size);
            }
            catch (std::exception const& e)
            {
                TLLM_LOG_EXCEPTION(e);
            }
        }
        mAllocatedChunks.clear();
    }

    [[nodiscard]] std::size_t getChunkSize() const
    {
        std::lock_guard<std::mutex> lock(mLock);
        return mChunkSize;
    }

    void setChunkSize(std::size_t chunkSize)
    {
        std::lock_guard<std::mutex> lock(mLock);
        mChunkSize = chunkSize;
    }

    [[nodiscard]] std::size_t getUsedSize() const
    {
        std::lock_guard<std::mutex> lock(mLock);
        return std::accumulate(mMemorySegments.cbegin(), mMemorySegments.cend(), std::size_t{0},
            [](std::size_t sum, auto const& chunk) { return chunk.tag ? sum + chunk.size : sum; });
    }

    [[nodiscard]] std::size_t getReservedSize() const
    {
        std::lock_guard<std::mutex> lock(mLock);
        return std::accumulate(mAllocatedChunks.cbegin(), mAllocatedChunks.cend(), std::size_t{0},
            [](std::size_t sum, auto const& chunk) { return sum + std::get<1>(chunk); });
    }

    class MemorySegment
    {
    public:
        MemorySegment(PointerType basePointer, std::size_t size, std::size_t offset = 0, PointerType tag = nullptr)
            : basePointer{basePointer}
            , size{size}
            , offset{offset}
            , tag{tag}
        {
        }

        PointerType const basePointer;
        std::size_t size;
        std::size_t offset;
        PointerType tag;
    };

    // for debugging purposes only
    std::list<MemorySegment> const& getMemorySegments() const
    {
        std::lock_guard<std::mutex> lock(mLock);
        return mMemorySegments;
    }

    // for debugging purposes only
    void logSegments() const;

protected:
    void allocateImpl(PointerType* ptr, std::size_t requestedSize);

    void deallocateImpl(PointerType tag, std::size_t n);

private:
    std::size_t mChunkSize;
    TAllocator mAllocator;
    std::mutex mutable mLock{};

    std::list<MemorySegment> mMemorySegments = {};
    std::vector<std::tuple<PointerType, std::size_t>> mAllocatedChunks = {};

    void allocateChunk()
    {
        TLLM_LOG_DEBUG("MemoryPool: Allocating %zu B", mChunkSize);
        auto basePointer = mAllocator.allocate(mChunkSize);
        mAllocatedChunks.emplace_back(basePointer, mChunkSize);
        mMemorySegments.push_back(MemorySegment{basePointer, mChunkSize});
    }
};
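
// Illustrative sketch (hypothetical sizes): the pool reserves whole chunks from its underlying
// allocator and carves aligned segments out of them; deallocation marks a segment free and merges
// it with free neighbours instead of returning memory to the allocator.
//
//   MemoryPool<PinnedAllocator> pool; // default chunk size: 512 MB
//   void* a = pool.allocate(1000);    // rounded up to 1024 B (kAlignment == 256); first chunk allocated lazily
//   void* b = pool.allocate(2000);    // served from the remainder of the same chunk
//   pool.deallocate(a, 1000);         // segment becomes free again; the chunk stays reserved
//   pool.deallocate(b, 2000);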

template <typename TAllocator>
void MemoryPool<TAllocator>::allocateImpl(MemoryPool::PointerType* ptr, std::size_t requestedSize)
{
    std::lock_guard<std::mutex> lock(mLock);

    // Align requested size to kAlignment
    // When requesting 0 B, default to allocating 1 B (from "Effective C++", item 51)
    // See https://stackoverflow.com/questions/2660076/returning-aligned-memory-with-new
    std::size_t const alignedRequest{
        requestedSize == 0 ? kAlignment : common::ceilDiv(requestedSize, kAlignment) * kAlignment};

    TLLM_LOG_DEBUG("MemoryPool: Requested to reserve %zu B (%zu B aligned)", requestedSize, alignedRequest);

    // Finds first free segment providing sufficient space
    auto it = std::find_if(mMemorySegments.begin(), mMemorySegments.end(),
        [alignedRequest](auto const& ms) { return ms.tag == nullptr && ms.size >= alignedRequest; });

    if (it == mMemorySegments.end())
    {
        // There is no space available for this request:
        // Adapt mChunkSize to the aligned requested size in case it doesn't fit,
        // allocate a chunk of mChunkSize and fulfill this request
        TLLM_LOG_DEBUG("MemoryPool: Needs more space to accommodate request of %zu B", requestedSize);
        if (mChunkSize < alignedRequest)
        {
            mChunkSize = alignedRequest;
            TLLM_LOG_DEBUG("MemoryPool: Increasing chunk size to %zu B", mChunkSize);
        }
        allocateChunk();
        it = std::prev(mMemorySegments.end());
    }

    // Start of allocation
    auto const offset = it->offset;
    auto const basePointer = it->basePointer;

    // Update current segment
    it->offset += alignedRequest;
    it->size -= alignedRequest;
    if (it->size == 0)
    {
        it = mMemorySegments.erase(it);
    }

    // Update pointer
    *ptr = static_cast<PointerType>(static_cast<std::uint8_t*>(basePointer) + offset);

    // Insert an occupied segment
    mMemorySegments.insert(it, MemorySegment{basePointer, alignedRequest, offset, *ptr});
}
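
// Worked example of the rounding above, with kAlignment == 256: a request of 1000 B becomes
// ceilDiv(1000, 256) * 256 == 4 * 256 == 1024 B, and a request of 0 B is served as a 256 B segment.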

template <typename TAllocator>
void MemoryPool<TAllocator>::deallocateImpl(PointerType tag, std::size_t n)
{
    std::lock_guard<std::mutex> lock(mLock);
    auto it = std::find_if(mMemorySegments.begin(), mMemorySegments.end(),
        [&tag](MemorySegment const& segment) { return segment.tag == tag; });

    TLLM_CHECK_WITH_INFO(it != mMemorySegments.end(), "MemoryPool free: Requested tag %p could not be found", tag);

    // Free found tag
    it->tag = nullptr;

    if (it->size < n)
    {
        TLLM_LOG_WARNING("MemoryPool: Requested to free %zu B, but only %zu B available", n, it->size);
    }

    // Check if previous segment is free, in which case, join
    if (it != mMemorySegments.begin())
    {
        auto previousIt = std::prev(it);
        if (previousIt->tag == nullptr && previousIt->basePointer == it->basePointer)
        {
            previousIt->size += it->size;
            // Remove current element, and point to previous one
            it = std::prev(mMemorySegments.erase(it));
        }
    }

    // Check if next segment is free, in which case, join
    if (std::next(it) != mMemorySegments.end())
    {
        auto nextIt = std::next(it);
        if (nextIt->tag == nullptr && nextIt->basePointer == it->basePointer)
        {
            it->size += nextIt->size;
            // Remove next tag
            mMemorySegments.erase(nextIt);
        }
    }
}

template <typename TAllocator>
void MemoryPool<TAllocator>::logSegments() const
{
    std::lock_guard<std::mutex> lock(mLock);
    TLLM_LOG_DEBUG("MemoryPool segments:");
    for (auto ms : mMemorySegments)
    {
        TLLM_LOG_DEBUG("* Segment size %zu, tag %p, basePointer %p", ms.size, ms.tag, ms.basePointer);
    }
}

template <typename TAllocator>
class PoolAllocator : public BaseAllocator<PoolAllocator<TAllocator>, TAllocator::kMemoryType, false>
{
    friend class BaseAllocator<PoolAllocator<TAllocator>, TAllocator::kMemoryType, false>;

public:
    using Base = BaseAllocator<PoolAllocator<TAllocator>, TAllocator::kMemoryType, false>;
    using PointerType = typename Base::PointerType;
    using PoolType = MemoryPool<TAllocator>;

    static PoolType& getPool();

protected:
    void allocateImpl(PointerType* ptr, std::size_t n) // NOLINT(readability-convert-member-functions-to-static)
    {
        *ptr = getPool().allocate(n);
    }

    void deallocateImpl( // NOLINT(readability-convert-member-functions-to-static)
        typename TAllocator::PointerType ptr, std::size_t n)
    {
        getPool().deallocate(ptr, n);
    }
};

using PinnedPoolAllocator = PoolAllocator<PinnedAllocator>;

class CudaVirtualMemoryAllocatorAdaptor
    : public BaseAllocator<CudaVirtualMemoryAllocatorAdaptor, MemoryType::kGPU, /* count */ false>,
      CudaVirtualMemoryAllocator
{
    // Update to MemoryCounters is done in Creator to more precisely reflect the memory usage.
    using Base = BaseAllocator<CudaVirtualMemoryAllocatorAdaptor, MemoryType::kGPU, false>;
    friend Base;

public:
    // No explicit, to allow implicit conversion from CudaVirtualMemoryAllocator
    CudaVirtualMemoryAllocatorAdaptor(CudaVirtualMemoryAllocator const& allocator)
        : CudaVirtualMemoryAllocator(allocator)
    {
    }

    using Base::allocate;
    using Base::deallocate;

protected:
    void allocateImpl(PointerType* ptr, std::size_t n) const
    {
        this->CudaVirtualMemoryAllocator::allocate(ptr, n, tensorrt_llm::common::getDevice());
    }

    void deallocateImpl(PointerType ptr, std::size_t n) const
    {
        this->CudaVirtualMemoryAllocator::deallocate(ptr, n);
    }
};
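
// Illustrative sketch (hypothetical allocator object): the adaptor only adds the BaseAllocator
// interface on top of an existing CudaVirtualMemoryAllocator, so it can be used wherever the other
// allocators in this header are expected.
//
//   CudaVirtualMemoryAllocator virtualMemoryAllocator = /* configured elsewhere */;
//   CudaVirtualMemoryAllocatorAdaptor adaptor = virtualMemoryAllocator; // implicit conversion
//   void* ptr = adaptor.allocate(std::size_t{1} << 20);
//   adaptor.deallocate(ptr, std::size_t{1} << 20);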

// Adopted from https://github.com/NVIDIA/TensorRT/blob/release/8.6/samples/common/buffers.h

//!
//! \brief The GenericBuffer class is a templated class for buffers.
//!
//! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation,
//!          deallocation, and querying of buffers on both the device and the host.
//!          It can handle data of arbitrary types because it stores byte buffers.
//!          The template parameter TAllocator must inherit from BaseAllocator.
//!
template <typename TAllocator>
class GenericBuffer : virtual public IBuffer, TAllocator // Inherit from TAllocator for EBO
{
public:
    using AllocatorType = TAllocator;

    //!
    //! \brief Construct an empty buffer.
    //!
    explicit GenericBuffer(nvinfer1::DataType type, TAllocator allocator = {}) // NOLINT(*-pro-type-member-init)
        : GenericBuffer{0, type, std::move(allocator)}
    {
    }

    //!
    //! \brief Construct a buffer with the specified allocation size in number of elements.
    //!
    explicit GenericBuffer( // NOLINT(*-pro-type-member-init)
        std::size_t size, nvinfer1::DataType type, TAllocator allocator = {})
        : GenericBuffer{size, size, type, std::move(allocator)}
    {
    }

    GenericBuffer(GenericBuffer const& other) = delete;
    GenericBuffer& operator=(GenericBuffer const& buf) = delete;

    GenericBuffer(GenericBuffer&& buf) noexcept
        : TAllocator(static_cast<TAllocator&&>(buf))
        , mSize{buf.mSize}
        , mCapacity{buf.mCapacity}
        , mType{buf.mType}
        , mBuffer{buf.mBuffer}
    {
        buf.mSize = 0;
        buf.mCapacity = 0;
        buf.mBuffer = nullptr;
    }

    GenericBuffer& operator=(GenericBuffer&& buf) noexcept
    {
        if (this != &buf)
        {
            this->TAllocator::deallocate(mBuffer, toBytes(mCapacity));
            mSize = buf.mSize;
            mCapacity = buf.mCapacity;
            mType = buf.mType;
            *static_cast<TAllocator*>(this) = static_cast<TAllocator&&>(buf);
            mBuffer = buf.mBuffer;
            // Reset buf.
            buf.mSize = 0;
            buf.mCapacity = 0;
            buf.mBuffer = nullptr;
        }
        return *this;
    }

    using IBuffer::data;

    //!
    //! \brief Returns a pointer to the underlying array.
    //! \details Returns nullptr if size == 0 so behavior is consistent with BufferView.
    //!
    void* data() override
    {
        return TLLM_LIKELY(mSize > 0) ? mBuffer : nullptr;
    }

    //!
    //! \brief Returns a pointer to the underlying array.
    //! \details Returns nullptr if size == 0 so behavior is consistent with BufferView.
    //!
    [[nodiscard]] void const* data() const override
    {
        return TLLM_LIKELY(mSize > 0) ? mBuffer : nullptr;
    }

    //!
    //! \brief Returns the size (in number of elements) of the buffer.
    //!
    [[nodiscard]] std::size_t getSize() const override
    {
        return mSize;
    }

    //!
    //! \brief Returns the capacity of the buffer.
    //!
    [[nodiscard]] std::size_t getCapacity() const override
    {
        return mCapacity;
    }

    //!
    //! \brief Returns the type of the buffer.
    //!
    [[nodiscard]] nvinfer1::DataType getDataType() const override
    {
        return mType;
    }

    //!
    //! \brief Returns the memory type of the buffer.
    //!
    [[nodiscard]] MemoryType getMemoryType() const override
    {
        return this->TAllocator::getMemoryType();
    }

    //!
    //! \brief Resizes the buffer. No reallocation occurs if the new size is smaller than or equal to the current
    //!        capacity.
    //!
    void resize(std::size_t newSize) override
    {
        if (mCapacity < newSize)
        {
            this->TAllocator::deallocate(mBuffer, toBytes(mCapacity));
            mBuffer = this->TAllocator::allocate(toBytes(newSize));
            mCapacity = newSize;
        }
        mSize = newSize;
    }

    //!
    //! \brief Releases the buffer.
    //!
    void release() override
    {
        this->TAllocator::deallocate(mBuffer, toBytes(mCapacity));
        mSize = 0;
        mCapacity = 0;
        mBuffer = nullptr;
    }

    ~GenericBuffer() override
    {
        try
        {
            this->TAllocator::deallocate(mBuffer, toBytes(mCapacity));
        }
        catch (std::exception const& e)
        {
            TLLM_LOG_EXCEPTION(e);
        }
    }

protected:
    explicit GenericBuffer(std::size_t size, std::size_t capacity, nvinfer1::DataType type, TAllocator allocator = {})
        : TAllocator{std::move(allocator)}
        , mSize{size}
        , mCapacity{capacity}
        , mType{type}
        , mBuffer{capacity > 0 ? this->TAllocator::allocate(toBytes(capacity)) : nullptr}
    {
        TLLM_CHECK(size <= capacity);
        TLLM_CHECK(capacity == 0 || size > 0);
    }

private:
    std::size_t mSize{0}, mCapacity{0};
    nvinfer1::DataType mType;
    void* mBuffer;
};
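
// Illustrative sketch (hypothetical sizes): a GenericBuffer owns `size` elements of the given data
// type, reallocates only when resized beyond its current capacity, and frees its storage on
// release() or destruction.
//
//   GenericBuffer<HostAllocator> buffer{16, nvinfer1::DataType::kFLOAT}; // 16 floats on the host
//   buffer.resize(8);  // no reallocation; capacity stays at 16 elements
//   buffer.resize(32); // reallocates to hold 32 elements
//   buffer.release();  // frees the storage; size and capacity drop to 0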

class MulticastBuffer : virtual public IBuffer
{
public:
    explicit MulticastBuffer(nvinfer1::DataType type, std::set<int> const& ranks)
        : mSize(0)
        , mCapacity(0)
        , mType(type)
        , mRanks(ranks)
    {
        TLLM_CHECK(ranks.size() > 1);
    }

    explicit MulticastBuffer(size_t size, nvinfer1::DataType type, std::set<int> const& ranks)
        : mSize(0)
        , mCapacity(0)
        , mType(type)
        , mRanks(ranks)
    {
        TLLM_CHECK(size > 0);
        TLLM_CHECK(ranks.size() > 1);
        resize(size);
    }

    MulticastBuffer(MulticastBuffer& other) = delete;
    MulticastBuffer& operator=(MulticastBuffer const& other) = delete;

    MulticastBuffer(MulticastBuffer&& other) noexcept
        : mSize(other.mSize)
        , mCapacity(other.mCapacity)
        , mType(other.mType)
        , mRanks(other.mRanks)
        , mHandle(other.mHandle)
    {
        other.mSize = 0;
        other.mCapacity = 0;
        other.mHandle = nullptr;
    }

    ~MulticastBuffer() override
    {
        MulticastBuffer::release();
    }

    MulticastBuffer& operator=(MulticastBuffer&& other) noexcept
    {
        if (this != &other)
        {
            // free old memory as we are assigning new memory to it
            release();

            mSize = other.mSize;
            mCapacity = other.mCapacity;
            mType = other.mType;
            mRanks = other.mRanks;
            mHandle = other.mHandle;

            // reset other
            other.mSize = 0;
            other.mCapacity = 0;
            other.mHandle = nullptr;
        }
        return *this;
    }

    // Return list of pointers to each rank
    [[nodiscard]] void* dataIpcList()
    {
        return reinterpret_cast<void*>(mHandle->ipc_uc_ptrs.data());
    }

    [[nodiscard]] void const* dataIpcList() const
    {
        return reinterpret_cast<void const*>(mHandle->ipc_uc_ptrs.data());
    }

    [[nodiscard]] void* dataMC()
    {
        return reinterpret_cast<void*>(mHandle->mc_ptr);
    }

    [[nodiscard]] void const* dataMC() const
    {
        return reinterpret_cast<void const*>(mHandle->mc_ptr);
    }

    //////////////////////////
    // Methods from IBuffer
    //////////////////////////

    using IBuffer::data;

    // Return unicast pointer
    [[nodiscard]] void* data() override
    {
        return reinterpret_cast<void*>(mHandle->uc_ptr);
    }

    // Return unicast pointer
    [[nodiscard]] void const* data() const override
    {
        return reinterpret_cast<void const*>(mHandle->uc_ptr);
    }

    [[nodiscard]] std::size_t getSize() const override
    {
        return mSize;
    }

    [[nodiscard]] std::size_t getCapacity() const override
    {
        return mCapacity;
    }

    [[nodiscard]] nvinfer1::DataType getDataType() const override
    {
        return mType;
    }

    [[nodiscard]] MemoryType getMemoryType() const override
    {
        return MemoryType::kGPU;
    }

    void resize(std::size_t newSize) override
    {
        TLLM_CHECK(newSize > 0);
        if (mCapacity < newSize)
        {
            release();
            mHandle = ipcNvlsAllocate(toBytes(newSize), mRanks);

            TLLM_CHECK(mHandle->size % BufferDataType(mType).getSize() == 0);
            mCapacity = mHandle->size / BufferDataType(mType).getSize();
        }
        mSize = newSize;
    }

    void release() override
    {
        if (mCapacity > 0)
        {
            TLLM_CHECK(mHandle->size > 0);
            ipcNvlsFree(mHandle);
        }
    }

private:
    std::size_t mSize = 0;
    std::size_t mCapacity = 0;
    nvinfer1::DataType mType;
    std::set<int> mRanks;
    IpcNvlsHandle* mHandle;
};
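
// Illustrative sketch (hypothetical ranks): a MulticastBuffer is backed by NVLS memory shared by a
// set of ranks, and the same allocation is reachable through a unicast pointer (data()), a
// multicast pointer (dataMC()), and the per-rank IPC pointers (dataIpcList()).
//
//   MulticastBuffer buffer{1024, nvinfer1::DataType::kHALF, std::set<int>{0, 1, 2, 3}};
//   void* localPtr = buffer.data(); // unicast mapping for the calling rank
//   void* mcPtr = buffer.dataMC();  // multicast mapping covering all participating ranks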

using DeviceBuffer = GenericBuffer<CudaAllocatorAsync>;
using StaticDeviceBuffer = GenericBuffer<CudaAllocator>;
using HostBuffer = GenericBuffer<HostAllocator>;
using PinnedBuffer = GenericBuffer<PinnedAllocator>;
using PinnedPoolBuffer = GenericBuffer<PinnedPoolAllocator>;
using UVMBuffer = GenericBuffer<UVMAllocator>;
using VirtualAddressDeviceBuffer = GenericBuffer<CudaVirtualMemoryAllocatorAdaptor>;

template <typename T>
std::make_unsigned_t<T> nonNegative(T value)
{
    TLLM_CHECK_WITH_INFO(value >= 0, "Value must be non-negative");
    return static_cast<std::make_unsigned_t<T>>(value);
}

template <typename TAllocator>
class GenericTensor : virtual public ITensor, public GenericBuffer<TAllocator>
{
public:
    using Base = GenericBuffer<TAllocator>;

    //!
    //! \brief Construct an empty tensor.
    //!
    explicit GenericTensor(nvinfer1::DataType type, TAllocator allocator = {})
        : Base{type, std::move(allocator)}
    {
        mDims.nbDims = 0;
    }

    //!
    //! \brief Construct a tensor with the specified allocation dimensions.
    //!
    explicit GenericTensor(nvinfer1::Dims const& dims, nvinfer1::DataType type, TAllocator allocator = {})
        : Base{nonNegative(volume(dims)), type, std::move(allocator)}
        , mDims{dims}
    {
    }

    explicit GenericTensor(
        nvinfer1::Dims const& dims, std::size_t capacity, nvinfer1::DataType type, TAllocator allocator = {})
        : Base{nonNegative(volume(dims)), capacity, type, std::move(allocator)}
        , mDims{dims}
    {
    }

    GenericTensor(GenericTensor&& tensor) noexcept
        : Base{std::move(tensor)}
        , mDims{tensor.mDims}
    {
        tensor.mDims.nbDims = 0;
    }

    GenericTensor& operator=(GenericTensor&& tensor) noexcept
    {
        if (this != &tensor)
        {
            Base::operator=(std::move(tensor));
            mDims = tensor.mDims;
            // Reset tensor.
            tensor.mDims.nbDims = 0;
        }
        return *this;
    }

    [[nodiscard]] nvinfer1::Dims const& getShape() const override
    {
        return mDims;
    }

    void reshape(nvinfer1::Dims const& dims) override
    {
        Base::resize(nonNegative(volume(dims)));
        mDims = dims;
    }

    void resize(std::size_t newSize) override
    {
        ITensor::resize(newSize);
    }

    void release() override
    {
        Base::release();
        mDims.nbDims = 0;
    }

private:
    nvinfer1::Dims mDims{};
};
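
// Illustrative sketch (hypothetical shape): a GenericTensor is a GenericBuffer whose element count
// is derived from its shape, so reshape() resizes the underlying storage when the volume grows.
//
//   nvinfer1::Dims dims;
//   dims.nbDims = 2;
//   dims.d[0] = 3;
//   dims.d[1] = 4;
//   GenericTensor<HostAllocator> tensor{dims, nvinfer1::DataType::kINT32}; // 12 elements
//   dims.d[0] = 6;
//   tensor.reshape(dims);                                                  // grows to 24 elements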

// Forward declaration
class MulticastTensor;

class MulticastTensorView : virtual public ITensor
{
public:
    enum class ViewType
    {
        kUNICAST,
        kMULTICAST,
        kIPC_LIST
    };

    explicit MulticastTensorView(std::weak_ptr<MulticastTensor> const& tensor, ViewType viewType);

    MulticastTensorView(MulticastTensorView&& other) noexcept;

    [[nodiscard]] MulticastTensorView& operator=(MulticastTensorView&& other) noexcept;

    /////////////////////
    // ITensor methods
    /////////////////////
    [[nodiscard]] nvinfer1::Dims const& getShape() const override;

    void reshape(nvinfer1::Dims const& dims) override;

    /////////////////////
    // IBuffer methods
    /////////////////////

    [[nodiscard]] std::size_t getSize() const override;

    [[nodiscard]] std::size_t getCapacity() const override;

    [[nodiscard]] nvinfer1::DataType getDataType() const override;

    [[nodiscard]] MemoryType getMemoryType() const override;

    using ITensor::data;

    [[nodiscard]] void* data() override
    {
        return _data();
    }

    [[nodiscard]] void const* data() const override
    {
        return _data();
    }

    void resize(std::size_t newSize) override
    {
        TLLM_THROW("Cannot resize() MulticastTensorView");
    }

    void release() override
    {
        TLLM_THROW("Cannot release() MulticastTensorView");
    }

private:
    [[nodiscard]] std::shared_ptr<MulticastBuffer> lock() const;

    [[nodiscard]] void* _data() const;

    std::weak_ptr<MulticastTensor> mTensor;
    ViewType mViewType;
    nvinfer1::Dims mDims{};
};

class MulticastTensor : virtual public ITensor,
                        public MulticastBuffer,
                        public std::enable_shared_from_this<MulticastTensor>
{
public:
    using Base = MulticastBuffer;

    explicit MulticastTensor(nvinfer1::DataType type, std::set<int> const& ranks)
        : Base(type, ranks)
    {
        mDims.nbDims = 0;
    }

    explicit MulticastTensor(nvinfer1::Dims const& dims, nvinfer1::DataType type, std::set<int> const& ranks)
        : Base(nonNegative(volume(dims)), type, ranks)
        , mDims(dims)
    {
    }

    MulticastTensor(MulticastTensor& other) = delete;
    MulticastTensor& operator=(MulticastTensor const& other) = delete;

    MulticastTensor(MulticastTensor&& tensor) noexcept
        : Base(std::move(tensor))
        , mDims(tensor.mDims)
    {
        tensor.mDims.nbDims = 0;
    }

    [[nodiscard]] MulticastTensor& operator=(MulticastTensor&& tensor) noexcept
    {
        if (this != &tensor)
        {
            Base::operator=(std::move(tensor));
            mDims = tensor.mDims;
            // Reset tensor.
            tensor.mDims.nbDims = 0;
        }
        return *this;
    }

    [[nodiscard]] std::shared_ptr<ITensor> getTensorView(MulticastTensorView::ViewType viewType)
    {
        return std::make_shared<MulticastTensorView>(weak_from_this(), viewType);
    }

    /////////////////////
    // ITensor methods
    /////////////////////
    [[nodiscard]] nvinfer1::Dims const& getShape() const override
    {
        return mDims;
    }

    void reshape(nvinfer1::Dims const& dims) override
    {
        Base::resize(nonNegative(volume(dims)));
        mDims = dims;
    }

    void resize(std::size_t newSize) override
    {
        ITensor::resize(newSize);
    }

    void release() override
    {
        Base::release();
        mDims.nbDims = 0;
    }

private:
    nvinfer1::Dims mDims{};
};

using DeviceTensor = GenericTensor<CudaAllocatorAsync>;
using StaticDeviceTensor = GenericTensor<CudaAllocator>;
using HostTensor = GenericTensor<HostAllocator>;
using PinnedTensor = GenericTensor<PinnedAllocator>;
using PinnedPoolTensor = GenericTensor<PinnedPoolAllocator>;
using UVMTensor = GenericTensor<UVMAllocator>;
using VirtualAddressDeviceTensor = GenericTensor<CudaVirtualMemoryAllocatorAdaptor>;

} // namespace tensorrt_llm::runtime