/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
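
// Unit tests for runtime::DecodingLayerWorkspace: workspace construction and resizing,
// device batch-slot placement, and mirroring host tensors into device workspace memory.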
#include "tensorrt_llm/runtime/decodingLayerWorkspace.h"
|
|
#include "tensorrt_llm/common/cudaUtils.h"
|
|
#include "tensorrt_llm/common/workspace.h"
|
|
#include <gtest/gtest.h>
|
|
#include <random>
|
|
|
|

using namespace tensorrt_llm;

namespace
{
// Fills the given host buffer with deterministic pseudo-random bytes derived from the seed.
void populateCpuBufferWithRandomBytes(uint64_t seed, runtime::IBuffer& buffer)
{
    std::mt19937 generator(seed);
    std::uniform_int_distribution<int> distribution(0, 255);
    auto* bufferPtr = reinterpret_cast<unsigned char*>(buffer.data());
    for (size_t i = 0; i < buffer.getSizeInBytes(); ++i)
    {
        *(bufferPtr + i) = static_cast<unsigned char>(distribution(generator));
    }
}

// Returns true if the two memory regions hold identical bytes.
bool areMemoryRegionsEqual(void const* ptr1, void const* ptr2, size_t size)
{
    // Use std::memcmp to compare the memory regions
    return std::memcmp(ptr1, ptr2, size) == 0;
}

// Asserts that two buffers have the same size and identical contents.
void testBufferEqual(runtime::IBuffer& left, runtime::IBuffer& right)
{
    auto const size = left.getSizeInBytes();
    ASSERT_EQ(size, right.getSizeInBytes());
    ASSERT_TRUE(areMemoryRegionsEqual(left.data(), right.data(), size));
}
} // namespace
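
// BasicUsageTest sweeps max batch sizes of 2^0 through 2^13 and initial workspace sizes of
// 2^0 through 2^28 bytes (even exponents only).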
auto const maxBatchSizePowersOfTwo = testing::Range(0, 14, 1);
auto const workspaceSizePowersOfTwo = testing::Range(0, 30, 2);

auto const initialBatchAndWorkspaceSizes = testing::Combine(maxBatchSizePowersOfTwo, workspaceSizePowersOfTwo);

using BasicUsageParamType = std::tuple<int32_t, int32_t>;

class BasicUsageTest : public testing::TestWithParam<BasicUsageParamType>
{
    void SetUp() override
    {
        auto const deviceCount = common::getDeviceCount();
        if (deviceCount > 0)
        {
            mBufferManager = std::make_shared<runtime::BufferManager>(std::make_unique<runtime::CudaStream>());
        }
        else
        {
            GTEST_SKIP() << "This test suite cannot run on systems with no devices.";
        }
    }

protected:
    std::shared_ptr<runtime::BufferManager> mBufferManager = nullptr;
};
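
// Verifies that a freshly constructed workspace has the requested byte size and batch-slot
// capacity, that resize() never shrinks the buffer but does enlarge it, and that the batch
// slots actually live in device memory.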
TEST_P(BasicUsageTest, TestBasicUsageOfDecodingLayerWorkspace)
{
    auto const [maxBatchSizePowerOfTwo, workspaceSizePowerOfTwo] = GetParam();
    auto const maxBatchSize = static_cast<runtime::SizeType32>(std::pow(2, maxBatchSizePowerOfTwo));
    auto const workspaceSizeInBytes = static_cast<size_t>(std::pow(2, workspaceSizePowerOfTwo));
    auto const decoderDomain = tensorrt_llm::layers::DecoderDomain(maxBatchSize, 1, 1000, 1024);

    // Testing constructing the workspace.
    auto workspace = runtime::DecodingLayerWorkspace(
        mBufferManager, decoderDomain, tensorrt_llm::runtime::TRTDataType<float>::value, workspaceSizeInBytes);
    mBufferManager->getStream().synchronize();
    ASSERT_EQ(workspace.getWorkspaceDeviceBuffer()->getSizeInBytes(), workspaceSizeInBytes)
        << "The workspace size is not equal to the size we asked it to be.";
    ASSERT_EQ(workspace.getDeviceBatchSlots()->getSize(), maxBatchSize)
        << "The size of the device batch slots is not the max batch size provided to the workspace.";

    // Testing resizing the workspace: a smaller requested size must not shrink it, a larger one must grow it.
    workspace.resize(workspaceSizeInBytes / 2);
    ASSERT_EQ(workspace.getWorkspaceDeviceBuffer()->getSizeInBytes(), workspaceSizeInBytes)
        << "The workspace size should not shrink.";
    auto const biggerWorkspaceSize = workspaceSizeInBytes * 2;
    workspace.resize(biggerWorkspaceSize);
    ASSERT_EQ(workspace.getWorkspaceDeviceBuffer()->getSizeInBytes(), biggerWorkspaceSize)
        << "The workspace was not enlarged as expected.";

    // Checking that the device batch slots are actually on device.
    auto const deviceBatchSlots = workspace.getDeviceBatchSlots();
    ASSERT_EQ(deviceBatchSlots->getMemoryType(), runtime::MemoryType::kGPU)
        << "The device batch slots should be on device.";

    auto const* deviceBatchSlotsPtr = workspace.getDeviceBatchSlotsPtr();
    ASSERT_EQ(tensorrt_llm::common::getPtrCudaMemoryType(deviceBatchSlotsPtr), cudaMemoryType::cudaMemoryTypeDevice)
        << "Pointer to device batch slots should have cudaMemoryType = device.";
}

INSTANTIATE_TEST_SUITE_P(BasicUsage, BasicUsageTest, initialBatchAndWorkspaceSizes);
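
// MirrorInWorkspaceTest combines three tensor data types drawn from {bool, half, float,
// float*}, three tensor dimensions drawn from {10, 100}, and a fixed random seed.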
auto const randomSeeds = testing::Values(static_cast<std::uint64_t>(1234));
auto const tensorDimensions = testing::Values(10, 100);
auto const tensorDataTypes = testing::Values(runtime::TRTDataType<bool>::value, runtime::TRTDataType<half>::value,
    runtime::TRTDataType<float>::value, runtime::TRTDataType<float*>::value);
auto const tensorDataTypesTuples = testing::Combine(tensorDataTypes, tensorDataTypes, tensorDataTypes);

auto const tensorShapeTuples = testing::Combine(tensorDimensions, tensorDimensions, tensorDimensions);
auto const mirrorInWorkspaceParams = testing::Combine(tensorDataTypesTuples, tensorShapeTuples, randomSeeds);

using MirrorInWorkspaceParamType = std::tuple<std::tuple<nvinfer1::DataType, nvinfer1::DataType, nvinfer1::DataType>,
    std::tuple<std::int32_t, std::int32_t, std::int32_t>, std::uint64_t>;

class MirrorInWorkspaceTest : public testing::TestWithParam<MirrorInWorkspaceParamType>
{
    void SetUp() override
    {
        auto const deviceCount = common::getDeviceCount();
        if (deviceCount > 0)
        {
            mBufferManager = std::make_shared<runtime::BufferManager>(std::make_unique<runtime::CudaStream>());
        }
        else
        {
            GTEST_SKIP() << "This test suite cannot run on systems with no devices.";
        }
    }

protected:
    std::shared_ptr<runtime::BufferManager> mBufferManager = nullptr;
};
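
// Round-trip check for mirrorInWorkspace: fill host tensors with random bytes, mirror them
// into the device workspace, copy them back into fresh host tensors, and verify the bytes
// are unchanged. Also sanity-checks calculateRequiredWorkspaceSize against the tensor sizes
// plus a worst-case alignment overhead.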
TEST_P(MirrorInWorkspaceTest, TestMirrorInWorkspaceFunctionality)
{
    auto const [tensorDataTypes, tensorDimensions, randomSeed] = GetParam();
    auto const [tensorDataType1, tensorDataType2, tensorDataType3] = tensorDataTypes;
    auto const [tensorDimension1, tensorDimension2, tensorDimension3] = tensorDimensions;
    auto const decoderDomain = tensorrt_llm::layers::DecoderDomain(128, 1, 1000, 1024);

    // Create host tensors and size the workspace to hold mirrors of all of them.
    auto const hostTensorShape1
        = tensorrt_llm::runtime::ITensor::makeShape({tensorDimension1, tensorDimension2, tensorDimension3});
    auto const hostTensorShape2
        = tensorrt_llm::runtime::ITensor::makeShape({tensorDimension2, tensorDimension3, tensorDimension1});
    auto const hostTensorShape3
        = tensorrt_llm::runtime::ITensor::makeShape({tensorDimension3, tensorDimension1, tensorDimension2});
    runtime::ITensor::SharedPtr const hostTensor1 = mBufferManager->cpu(hostTensorShape1, tensorDataType1);
    runtime::ITensor::SharedPtr const hostTensor2 = mBufferManager->cpu(hostTensorShape1, tensorDataType2);
    runtime::ITensor::SharedPtr const hostTensor3 = mBufferManager->cpu(hostTensorShape1, tensorDataType3);

    auto const requiredWorkspaceSize = tensorrt_llm::runtime::DecodingLayerWorkspace::calculateRequiredWorkspaceSize(
        std::make_pair(hostTensorShape1, tensorDataType1), std::make_pair(hostTensorShape2, tensorDataType2),
        std::make_pair(hostTensorShape3, tensorDataType3));
    auto workspace = runtime::DecodingLayerWorkspace(
        mBufferManager, decoderDomain, tensorrt_llm::runtime::TRTDataType<float>::value, requiredWorkspaceSize);
    mBufferManager->getStream().synchronize();

    ASSERT_LE(hostTensor1->getSizeInBytes() + hostTensor2->getSizeInBytes() + hostTensor3->getSizeInBytes(),
        requiredWorkspaceSize)
        << "The calculated workspace size cannot possibly be enough to contain all the tensors.";

    constexpr std::size_t addressAlignment = tensorrt_llm::common::kCudaMemAlign;
    constexpr std::size_t numTensors = 3;
    constexpr std::size_t maxAlignmentOverhead = numTensors * addressAlignment;
    ASSERT_GE(hostTensor1->getSizeInBytes() + hostTensor2->getSizeInBytes() + hostTensor3->getSizeInBytes()
            + maxAlignmentOverhead,
        requiredWorkspaceSize)
        << "We probably overestimate the amount of space the workspace requires.";
    populateCpuBufferWithRandomBytes(randomSeed, *hostTensor1);
    populateCpuBufferWithRandomBytes(randomSeed, *hostTensor2);
    populateCpuBufferWithRandomBytes(randomSeed, *hostTensor3);

    auto const [deviceTensor1, deviceTensor2, deviceTensor3]
        = workspace.mirrorInWorkspace(hostTensor1, hostTensor2, hostTensor3);

    runtime::ITensor::SharedPtr const hostTensorCopy1 = mBufferManager->cpu(hostTensorShape1, tensorDataType1);
    runtime::ITensor::SharedPtr const hostTensorCopy2 = mBufferManager->cpu(hostTensorShape1, tensorDataType2);
    runtime::ITensor::SharedPtr const hostTensorCopy3 = mBufferManager->cpu(hostTensorShape1, tensorDataType3);
    mBufferManager->copy(*deviceTensor1, *hostTensorCopy1);
    mBufferManager->copy(*deviceTensor2, *hostTensorCopy2);
    mBufferManager->copy(*deviceTensor3, *hostTensorCopy3);
    testBufferEqual(*hostTensor1, *hostTensorCopy1);
    testBufferEqual(*hostTensor2, *hostTensorCopy2);
    testBufferEqual(*hostTensor3, *hostTensorCopy3);
}

INSTANTIATE_TEST_SUITE_P(MirrorInWorkspace, MirrorInWorkspaceTest, mirrorInWorkspaceParams);
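
// A minimal way to run just these suites with a gtest filter; the binary name below is an
// assumption for illustration, not taken from the build configuration:
//   ./decodingLayerWorkspaceTest --gtest_filter='BasicUsage/*:MirrorInWorkspace/*'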