mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* Update TensorRT-LLM --------- Co-authored-by: Denis Kayshev <topenkoff@gmail.com> Co-authored-by: akhoroshev <arthoroshev@gmail.com> Co-authored-by: Patrick Reiter Horn <patrick.horn@gmail.com> Update
212 lines
8.3 KiB
C++
212 lines
8.3 KiB
C++
/*
|
|
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "tensorrt_llm/common/assert.h"
|
|
#include "tensorrt_llm/runtime/cudaStream.h"
|
|
#include "tensorrt_llm/runtime/iBuffer.h"
|
|
#include "tensorrt_llm/runtime/iTensor.h"
|
|
#include <NvInferRuntime.h>
|
|
|
|
#include <cstring>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
class BufferManagerTest;
|
|
|
|
namespace tensorrt_llm::runtime
|
|
{
|
|
|
|
/// @brief Forward declaration; the full definition is not required here because the type is only used through a pointer.
|
|
class CudaMemPool;
|
|
|
|
//! \brief A helper class for managing memory on host and device.
|
|
class BufferManager
|
|
{
|
|
public:
|
|
using IBufferPtr = IBuffer::UniquePtr;
|
|
|
|
using ITensorPtr = ITensor::UniquePtr;
|
|
|
|
using CudaStreamPtr = std::shared_ptr<CudaStream>;
|
|
using CudaMemPoolPtr = std::shared_ptr<CudaMemPool>;
|
|
|
|
//! \brief Construct a BufferManager.
|
|
//!
|
|
//! \param[in] cudaStream The cuda stream to use for all operations on GPU (allocation, de-allocation, copying,
|
|
//! etc.).
|
|
explicit BufferManager(CudaStreamPtr stream, bool trimPool = false);
|
|
|
|
//! \brief Destructor.
|
|
~BufferManager()
|
|
{
|
|
if (mTrimPool)
|
|
{
|
|
memoryPoolTrimTo(0);
|
|
}
|
|
}
|
|
|
|
static auto constexpr kBYTE_TYPE = nvinfer1::DataType::kUINT8;
|
|
|
|
//! \brief Allocates an `IBuffer` of the given size on the GPU, using cudaMallocAsync.
|
|
[[nodiscard]] IBufferPtr gpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const;
|
|
|
|
//! \brief Allocates an `ITensor` of the given dimensions on the GPU, using cudaMallocAsync.
|
|
[[nodiscard]] ITensorPtr gpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const;
|
|
|
|
//! \brief Allocates an `IBuffer` of the given size on the GPU, using cudaMalloc.
|
|
[[nodiscard]] static IBufferPtr gpuSync(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates an `ITensor` of the given dimensions on the GPU, using cudaMalloc.
|
|
[[nodiscard]] static ITensorPtr gpuSync(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates an `IBuffer` of the given size on the CPU.
|
|
[[nodiscard]] static IBufferPtr cpu(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates an `ITensor` of the given dimensions on the CPU.
|
|
[[nodiscard]] static ITensorPtr cpu(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates a pinned `IBuffer` of the given size on the CPU.
|
|
[[nodiscard]] static IBufferPtr pinned(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates a pinned `ITensor` of the given dimensions on the CPU.
|
|
[[nodiscard]] static ITensorPtr pinned(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates a pinned `IBuffer` of the given size on the CPU in the default memory pool.
|
|
[[nodiscard]] static IBufferPtr pinnedPool(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates a pinned `ITensor` of the given dimensions on the CPU in the default memory pool.
|
|
[[nodiscard]] static ITensorPtr pinnedPool(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates an `IBuffer` of the given size in UVM.
|
|
[[nodiscard]] static IBufferPtr managed(std::size_t size, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates an `ITensor` of the given dimensions in UVM.
|
|
[[nodiscard]] static ITensorPtr managed(nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE);
|
|
|
|
//! \brief Allocates an `ITensor` of the given dimensions for NVLS
|
|
[[nodiscard]] static ITensorPtr ipcNvls(std::set<int> ranks, nvinfer1::Dims dims, nvinfer1::DataType type);
|
|
|
|
//! \brief Allocates an `IBuffer` of the given size and memory type.
|
|
[[nodiscard]] IBufferPtr allocate(
|
|
MemoryType memoryType, std::size_t size, nvinfer1::DataType type = kBYTE_TYPE) const;
|
|
|
|
//! \brief Allocates an `ITensor` of the given dimensions and memory type.
|
|
[[nodiscard]] ITensorPtr allocate(
|
|
MemoryType memoryType, nvinfer1::Dims dims, nvinfer1::DataType type = kBYTE_TYPE) const;
|
|
|
|
//! \brief Create an empty `IBuffer` of the given memory type. It may be resized later.
|
|
[[nodiscard]] IBufferPtr emptyBuffer(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
|
|
{
|
|
return allocate(memoryType, 0, type);
|
|
}
|
|
|
|
//! \brief Create an empty `ITensor` of the given memory type. It may be reshaped later.
|
|
[[nodiscard]] ITensorPtr emptyTensor(MemoryType memoryType, nvinfer1::DataType type = kBYTE_TYPE) const
|
|
{
|
|
return allocate(memoryType, ITensor::makeShape({}), type);
|
|
}
|
|
|
|
//! \brief Set the contents of the given `buffer` to value.
|
|
void setMem(IBuffer& buffer, int32_t value) const;
|
|
|
|
//! \brief Set the contents of the given `buffer` to zero.
|
|
void setZero(IBuffer& buffer) const;
|
|
|
|
//! \brief Copy `src` to `dst`.
|
|
void copy(void const* src, IBuffer& dst, MemoryType srcType) const;
|
|
|
|
//! \brief Copy `src` to `dst`.
|
|
void copy(IBuffer const& src, void* dst, MemoryType dstType) const;
|
|
|
|
//! \brief Copy `src` to `dst`.
|
|
void copy(void const* src, IBuffer& dst) const
|
|
{
|
|
return copy(src, dst, IBuffer::memoryType(src));
|
|
}
|
|
|
|
//! \brief Copy `src` to `dst`.
|
|
void copy(IBuffer const& src, void* dst) const
|
|
{
|
|
return copy(src, dst, IBuffer::memoryType(dst));
|
|
}
|
|
|
|
//! \brief Copy `src` to `dst`.
|
|
void copy(IBuffer const& src, IBuffer& dst) const;
|
|
|
|
//! \brief Copy `src` into a new `IBuffer` with a potentially different memory type.
|
|
[[nodiscard]] IBufferPtr copyFrom(IBuffer const& src, MemoryType memoryType) const;
|
|
|
|
//! \brief Copy `src` into a new `ITensor` with a potentially different memory type.
|
|
[[nodiscard]] ITensorPtr copyFrom(ITensor const& src, MemoryType memoryType) const;
|
|
|
|
//! \brief Copy `src` into a new `IBuffer` with a potentially different memory type.
|
|
template <typename T>
|
|
[[nodiscard]] IBufferPtr copyFrom(std::vector<T> const& src, MemoryType memoryType) const
|
|
{
|
|
auto buffer = allocate(memoryType, src.size(), TRTDataType<std::remove_cv_t<T>>::value);
|
|
copy(src.data(), *buffer);
|
|
return buffer;
|
|
}
|
|
|
|
//! \brief Copy `src` into a new `ITensor` with a potentially different memory type.
|
|
template <typename T>
|
|
[[nodiscard]] ITensorPtr copyFrom(T* src, nvinfer1::Dims dims, MemoryType memoryType) const
|
|
{
|
|
auto buffer = allocate(memoryType, dims, TRTDataType<std::remove_cv_t<T>>::value);
|
|
copy(src, *buffer);
|
|
return buffer;
|
|
}
|
|
|
|
//! \brief Copy `src` into a new `ITensor` with a potentially different memory type.
|
|
template <typename T>
|
|
[[nodiscard]] ITensorPtr copyFrom(std::vector<T> const& src, nvinfer1::Dims dims, MemoryType memoryType) const
|
|
{
|
|
TLLM_CHECK_WITH_INFO(src.size() == ITensor::volumeNonNegative(dims),
|
|
common::fmtstr("[TensorRT-LLM][ERROR] Incompatible size %lu and dims %s", src.size(),
|
|
ITensor::toString(dims).c_str()));
|
|
return copyFrom(src.data(), dims, memoryType);
|
|
}
|
|
|
|
//! \brief Get the underlying cuda stream.
|
|
[[nodiscard]] CudaStream const& getStream() const;
|
|
|
|
//! \brief The current size of the memory reserved by the memory pool.
|
|
[[nodiscard]] std::size_t memoryPoolReserved() const;
|
|
|
|
//! \brief The current size of the memory used by the memory pool.
|
|
[[nodiscard]] std::size_t memoryPoolUsed() const;
|
|
|
|
//! \brief The current size of the memory free in the memory pool.
|
|
[[nodiscard]] std::size_t memoryPoolFree() const;
|
|
|
|
//! \brief Try to trim the memory reserved by the pool to `size` bytes. This synchronizes implicitly with the
|
|
//! stream.
|
|
void memoryPoolTrimTo(std::size_t size);
|
|
|
|
private:
|
|
friend class ::BufferManagerTest;
|
|
|
|
CudaStreamPtr mStream;
|
|
CudaMemPoolPtr mPool;
|
|
bool const mTrimPool;
|
|
};
|
|
|
|
} // namespace tensorrt_llm::runtime
|