TensorRT-LLMs/cpp/tensorrt_llm/runtime/decodingLayerWorkspace.h
2024-11-12 15:27:49 +08:00

202 lines
9.2 KiB
C++

/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include "tensorrt_llm/common/dataType.h"
#include "tensorrt_llm/common/workspace.h"
#include "tensorrt_llm/layers/decodingParams.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/tllmBuffers.h"
namespace tensorrt_llm::runtime
{
///@brief A collection of shared resources and data for the decoding layers.
class DecodingLayerWorkspace
{
public:
using TensorPtr = ITensor::SharedPtr;
using TensorUniquePtr = ITensor::UniquePtr;
using TensorConstPtr = ITensor::SharedConstPtr;
using BufferPtr = IBuffer::SharedPtr;
DecodingLayerWorkspace(std::shared_ptr<BufferManager> bufferManager, layers::DecoderDomain const& decoderDomain,
nvinfer1::DataType logitsType, size_t workspaceBufferSizeInBytes);
DecodingLayerWorkspace() = delete;
DecodingLayerWorkspace(DecodingLayerWorkspace const& decodingLayerWorkspace) = delete;
///@brief Gets a pointer to the start of the shared device workspace.
[[nodiscard]] void* getRawWorkspaceDevicePtr() const;
///@brief Gets a pointer to the start of the shared device workspace, as a pointer to the given type.
template <typename T>
T* getWorkspaceDevicePtrAs() const
{
return reinterpret_cast<T*>(mWorkspaceDeviceBuffer->data());
};
///@brief Gets a pointer to the buffer backing the device workspace.
[[nodiscard]] BufferPtr getWorkspaceDeviceBuffer() const;
///@brief Sets the value of the device copy of the batch slots.
void setDeviceBatchSlots(TensorConstPtr const& newBatchSlots);
///@brief Gets the pointer to the batch slots on device.
[[nodiscard]] SizeType32 const* getDeviceBatchSlotsPtr() const;
///@brief Gets the device tensor containing the batch slots.
[[nodiscard]] TensorConstPtr getDeviceBatchSlots() const;
///@brief Gets the device tensor containing the runtime logits.
[[nodiscard]] TensorPtr getDeviceRuntimeLogits() const;
///@brief Gets a tensor with the given shape and type at the start of the device workspace.
TensorPtr getWorkspaceAsDeviceTensor(ITensor::Shape shape, nvinfer1::DataType type);
/// @brief A convenience function to copy the content of a standard vector to a device workspace.
template <typename T, typename Alloc>
static void copyToWorkspace(runtime::BufferManager const& bufferManager, std::vector<T, Alloc> const& src,
runtime::IBuffer::SharedPtr workspace)
{
auto const sizeOfWorkspaceInBytes = workspace->getSizeInBytes();
auto const sizeOfSrcInBytes = sizeof(T) * src.size();
TLLM_CHECK_WITH_INFO(sizeOfSrcInBytes <= sizeOfWorkspaceInBytes,
"The size of the workspace (%zu bytes) is insufficient for the data (%zu bytes)", sizeOfWorkspaceInBytes,
sizeOfSrcInBytes);
auto const sizePerWorkspaceElement = BufferDataType(workspace->getDataType()).getSize();
TLLM_CHECK_WITH_INFO(sizePerWorkspaceElement == 1 || sizePerWorkspaceElement == sizeof(T),
"Copy to typed workspace, but element size mismatched (src: %zu, workspace: %zu)", sizeof(T),
sizePerWorkspaceElement);
runtime::IBuffer::SharedPtr workspaceSlice
= runtime::IBuffer::slice(workspace, 0, sizeOfSrcInBytes / sizePerWorkspaceElement);
bufferManager.copy(src.data(), *workspaceSlice, runtime::MemoryType::kCPU);
}
/// @brief A convenience function to copy the content of a standard vector to the workspace.
template <typename T>
TensorPtr copyToWorkspace(std::vector<T> const& src)
{
copyToWorkspace(*mBufferManager, src, mWorkspaceDeviceBuffer);
return getWorkspaceAsDeviceTensor(
ITensor::makeShape({static_cast<SizeType32>(src.size())}), TRTDataType<T>::value);
}
///@brief Ensures the workspace has at least the provided space in bytes. Does nothing if the workspace is already
/// at least as large.
void resize(size_t minSize);
///@brief Given a collection of tuples of tensor shapes and data types, returns the memory aligned size required to
/// contain those tensors.
template <typename... Args>
size_t static calculateRequiredWorkspaceSize(Args&&... args)
{
size_t lastTensorOffset = 0;
auto alignedSizeCalculator
= [&lastTensorOffset](std::pair<ITensor::Shape, nvinfer1::DataType> const& tensorDescriptor)
{
auto const& [shape, type] = tensorDescriptor;
auto const sizeInBytes = ITensor::volume(shape) * tensorrt_llm::common::getDTypeSize(type);
auto const sliceEnd = lastTensorOffset + sizeInBytes;
lastTensorOffset = tensorrt_llm::common::alignSize(sliceEnd, tensorrt_llm::common::kCudaMemAlign);
};
auto argTuple = std::make_tuple(std::forward<Args>(args)...);
forEach(alignedSizeCalculator, argTuple);
return lastTensorOffset;
}
///@brief Given a collection of tensors, creates tensors with the same shape and data types in the workspace and
/// copies the data from the input tensors to their reflection on device.
template <typename... Args>
auto mirrorInWorkspace(Args&&... args)
{
auto* lastTensorEndPtr = reinterpret_cast<std::int8_t*>(mWorkspaceDeviceBuffer->data());
auto tensorFactory = [&lastTensorEndPtr, this](auto const& tensor)
{
if (tensor == nullptr)
{
return std::unique_ptr<GenericTensor<BorrowingAllocator<MemoryType::kGPU>>>{};
}
auto const sizeInBytes = tensor->getSizeInBytes();
auto const borrowingAllocator = BorrowingAllocator<MemoryType::kGPU>{lastTensorEndPtr, sizeInBytes};
auto res = std::make_unique<GenericTensor<BorrowingAllocator<MemoryType::kGPU>>>(
tensor->getShape(), tensor->getDataType(), borrowingAllocator);
auto const sliceEnd = lastTensorEndPtr + sizeInBytes;
lastTensorEndPtr = tensorrt_llm::common::alignPtr(sliceEnd, tensorrt_llm::common::kCudaMemAlign);
mBufferManager->copy(*tensor, *res);
return res;
};
auto argTuple = std::make_tuple(std::forward<Args>(args)...);
auto res = transform(tensorFactory, argTuple);
std::size_t const numArgs = sizeof...(Args);
std::size_t const sizeInBytes
= lastTensorEndPtr - reinterpret_cast<std::int8_t*>(mWorkspaceDeviceBuffer->data());
TLLM_LOG_DEBUG("Borrowing %lu bytes of the workspace for %i tensors.", sizeInBytes, numArgs);
return res;
}
/// @brief A convenience function to initialize curand states from a provided seed.
void initializeDeviceCurandStates(std::optional<std::vector<uint64_t>> const& randomSeed,
runtime::SizeType32 batchSize, TensorConstPtr const& batchSlots, TensorPtr& statesDevice);
private:
std::shared_ptr<BufferManager> mBufferManager;
TensorPtr mBatchSlotsDevice; // <! A copy of the batch slots on device ensure fast access when used in kernels.
TensorPtr mRuntimeLogitsDevice; // <! The working state of the logits while decoding.
TensorPtr
mCurandStatesDevice; // <! The state information of the random number generators for sampling based decoding.
BufferPtr mWorkspaceDeviceBuffer; // <! A buffer to be used as scratch space by the decoding layers.
cudaStream_t getStream();
///@brief A helper template to apply a function to each element of a tuple and return a tuple of the results.
template <typename Func, typename Tuple, std::size_t... I>
auto static transformImpl(Func&& func, Tuple&& tuple, std::index_sequence<I...>)
{
return std::make_tuple(func(std::get<I>(tuple))...);
}
///@brief A helper template to apply a function to each element of a tuple and return a tuple of the results.
template <typename Func, typename... Args>
auto static transform(Func&& func, std::tuple<Args...> const& tuple)
{
return transformImpl(std::forward<Func>(func), tuple, std::index_sequence_for<Args...>{});
}
///@brief A helper template to apply a function to each element of a tuple.
template <typename Func, typename Tuple, std::size_t... I>
void static forEachImpl(Func&& func, Tuple&& tuple, std::index_sequence<I...>)
{
(func(std::get<I>(tuple)), ...);
}
///@brief A helper template to apply a function to each element of a tuple.
template <typename Func, typename... Args>
void static forEach(Func&& func, std::tuple<Args...> const& tuple)
{
forEachImpl(std::forward<Func>(func), tuple, std::index_sequence_for<Args...>{});
}
};
} // namespace tensorrt_llm::runtime