/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/runtime/decodingLayerWorkspace.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/workspace.h"

// NOTE(review): the two angle-bracket include targets were lost in extraction. <gtest/gtest.h> and <random> are
// inferred from the use of TEST_P/testing:: and std::mt19937; the remaining headers cover the standard names used
// below.
#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <random>
#include <tuple>
#include <utility>

using namespace tensorrt_llm;

namespace
{

// Overwrites every byte of `buffer` with values in [0, 255] drawn from a std::mt19937 seeded with `seed`.
// The same seed always yields the same byte sequence. The buffer's data pointer is written directly from the
// host, so the caller must pass a host-accessible buffer.
void populateCpuBufferWithRandomBytes(uint64_t seed, runtime::IBuffer& buffer)
{
    std::mt19937 generator(seed);
    // `int` is used as the distribution type because character types are not valid IntType arguments for
    // std::uniform_int_distribution.
    std::uniform_int_distribution<int> distribution(0, 255);
    auto* const bytes = reinterpret_cast<std::uint8_t*>(buffer.data());
    auto const numBytes = buffer.getSizeInBytes();
    for (std::size_t i = 0; i < numBytes; ++i)
    {
        bytes[i] = static_cast<std::uint8_t>(distribution(generator));
    }
}

// Returns true when the first `size` bytes at `ptr1` and `ptr2` are identical.
bool areMemoryRegionsEqual(void const* ptr1, void const* ptr2, size_t size)
{
    return std::memcmp(ptr1, ptr2, size) == 0;
}

// Asserts that both buffers have the same size in bytes and identical contents.
// Both data pointers are dereferenced on the host.
void testBufferEqual(runtime::IBuffer const& left, runtime::IBuffer const& right)
{
    auto const size = left.getSizeInBytes();
    ASSERT_EQ(size, right.getSizeInBytes());
    ASSERT_TRUE(areMemoryRegionsEqual(left.data(), right.data(), size));
}

} // namespace

// Shared fixture for the parameterized suites in this file: creates a buffer manager when at least one device is
// reported, and skips the test otherwise.
// NOTE(review): runtime::BufferManager / runtime::CudaStream are reconstructed from the member name and the
// getStream()/cpu()/copy() calls below; the original template arguments were lost in extraction.
template <typename TParam>
class DecodingLayerWorkspaceTestBase : public testing::TestWithParam<TParam>
{
protected:
    void SetUp() override
    {
        if (common::getDeviceCount() <= 0)
        {
            GTEST_SKIP() << "This test suite cannot run on systems with no devices.";
        }
        mBufferManager = std::make_shared<runtime::BufferManager>(std::make_unique<runtime::CudaStream>());
    }

    std::shared_ptr<runtime::BufferManager> mBufferManager = nullptr;
};

// Exponents: max batch size is 2^0 .. 2^13, workspace size is 2^0, 2^2, .. 2^28 bytes.
auto const maxBatchSizePowersOfTwo = testing::Range(0, 14, 1);
auto const workspaceSizePowersOfTwo = testing::Range(0, 30, 2);
auto const initialBatchAndWorkspaceSizes = testing::Combine(maxBatchSizePowersOfTwo, workspaceSizePowersOfTwo);

// (max batch size exponent, workspace size exponent)
using BasicUsageParamType = std::tuple<int, int>;

class BasicUsageTest : public DecodingLayerWorkspaceTestBase<BasicUsageParamType>
{
};

TEST_P(BasicUsageTest, TestBasicUsageOfDecodingLayerWorkspace)
{
    auto const [maxBatchSizePowerOfTwo, workspaceSizePowerOfTwo] = GetParam();
    // Exact integer powers of two; avoids a round trip through floating point.
    // NOTE(review): the original cast target types were lost in extraction; SizeType32 / size_t are assumed.
    auto const maxBatchSize = runtime::SizeType32{1} << maxBatchSizePowerOfTwo;
    auto const workspaceSizeInBytes = std::size_t{1} << workspaceSizePowerOfTwo;
    auto const decoderDomain = tensorrt_llm::layers::DecoderDomain(maxBatchSize, 1, 1000, 1024);

    // Testing constructing the workspace.
    // NOTE(review): the TRTDataType template argument was lost in extraction; float is assumed.
    auto workspace = runtime::DecodingLayerWorkspace(
        mBufferManager, decoderDomain, tensorrt_llm::runtime::TRTDataType<float>::value, workspaceSizeInBytes);
    mBufferManager->getStream().synchronize();
    ASSERT_EQ(workspace.getWorkspaceDeviceBuffer()->getSizeInBytes(), workspaceSizeInBytes)
        << "The workspace size is not equal to the size we asked it to be.";
    ASSERT_EQ(workspace.getDeviceBatchSlots()->getSize(), static_cast<std::size_t>(maxBatchSize))
        << "The size of the device batch slots is not the max batch size provided to the workspace";

    // Requesting a smaller size must leave the workspace untouched.
    workspace.resize(workspaceSizeInBytes / 2);
    ASSERT_EQ(workspace.getWorkspaceDeviceBuffer()->getSizeInBytes(), workspaceSizeInBytes)
        << "The workspace size should not shrink.";

    // Testing enlarging the workspace.
    auto const biggerWorkspaceSize = workspaceSizeInBytes * 2;
    workspace.resize(biggerWorkspaceSize);
    ASSERT_EQ(workspace.getWorkspaceDeviceBuffer()->getSizeInBytes(), biggerWorkspaceSize)
        << "The workspace was not enlarged as expected";

    // Checking that the device batch slots are actually on device
    auto const deviceBatchSlots = workspace.getDeviceBatchSlots();
    ASSERT_EQ(deviceBatchSlots->getMemoryType(), runtime::MemoryType::kGPU)
        << "The device batch slots should be on device.";
    auto const* deviceBatchSlotsPtr = workspace.getDeviceBatchSlotsPtr();
    ASSERT_EQ(tensorrt_llm::common::getPtrCudaMemoryType(deviceBatchSlotsPtr), cudaMemoryType::cudaMemoryTypeDevice)
        << "Pointer to device batch slots should have cudaMemoryType = device.";
}

INSTANTIATE_TEST_SUITE_P(BasicUsage, BasicUsageTest, initialBatchAndWorkspaceSizes);

auto const randomSeeds = testing::Values(static_cast<std::uint64_t>(1234));
auto const tensorDimensions = testing::Values(10, 100);
// NOTE(review): the four element types were lost in extraction; the set below is a placeholder reconstruction and
// must be checked against the upstream file.
auto const tensorDataTypes = testing::Values(runtime::TRTDataType<float>::value, runtime::TRTDataType<half>::value,
    runtime::TRTDataType<std::int8_t>::value, runtime::TRTDataType<std::int32_t>::value);
auto const tensorDataTypesTuples = testing::Combine(tensorDataTypes, tensorDataTypes, tensorDataTypes);
auto const tensorShapeTuples = testing::Combine(tensorDimensions, tensorDimensions, tensorDimensions);
auto const mirrorInWorkspaceParams = testing::Combine(tensorDataTypesTuples, tensorShapeTuples, randomSeeds);

// ((dtype1, dtype2, dtype3), (dim1, dim2, dim3), seed)
// NOTE(review): nvinfer1::DataType is assumed to be the type of TRTDataType<T>::value.
using MirrorInWorkspaceParamType
    = std::tuple<std::tuple<nvinfer1::DataType, nvinfer1::DataType, nvinfer1::DataType>, std::tuple<int, int, int>,
        std::uint64_t>;

class MirrorInWorkspaceTest : public DecodingLayerWorkspaceTestBase<MirrorInWorkspaceParamType>
{
};

TEST_P(MirrorInWorkspaceTest, TestMirrorInWorkspaceFunctionality)
{
    auto const [tensorDataTypes, tensorDimensions, randomSeed] = GetParam();
    auto const [tensorDataType1, tensorDataType2, tensorDataType3] = tensorDataTypes;
    auto const [tensorDimension1, tensorDimension2, tensorDimension3] = tensorDimensions;
    auto const decoderDomain = tensorrt_llm::layers::DecoderDomain(128, 1, 1000, 1024);

    // Three shapes that are rotations of the same three dimensions.
    auto const hostTensorShape1
        = tensorrt_llm::runtime::ITensor::makeShape({tensorDimension1, tensorDimension2, tensorDimension3});
    auto const hostTensorShape2
        = tensorrt_llm::runtime::ITensor::makeShape({tensorDimension2, tensorDimension3, tensorDimension1});
    auto const hostTensorShape3
        = tensorrt_llm::runtime::ITensor::makeShape({tensorDimension3, tensorDimension1, tensorDimension2});

    // Each host tensor uses the shape that is also fed to calculateRequiredWorkspaceSize for it.
    runtime::ITensor::SharedPtr const hostTensor1 = mBufferManager->cpu(hostTensorShape1, tensorDataType1);
    runtime::ITensor::SharedPtr const hostTensor2 = mBufferManager->cpu(hostTensorShape2, tensorDataType2);
    runtime::ITensor::SharedPtr const hostTensor3 = mBufferManager->cpu(hostTensorShape3, tensorDataType3);

    auto const requiredWorkspaceSize = tensorrt_llm::runtime::DecodingLayerWorkspace::calculateRequiredWorkspaceSize(
        std::make_pair(hostTensorShape1, tensorDataType1), std::make_pair(hostTensorShape2, tensorDataType2),
        std::make_pair(hostTensorShape3, tensorDataType3));

    // Testing constructing the workspace.
    // NOTE(review): the TRTDataType template argument was lost in extraction; float is assumed.
    auto workspace = runtime::DecodingLayerWorkspace(
        mBufferManager, decoderDomain, tensorrt_llm::runtime::TRTDataType<float>::value, requiredWorkspaceSize);
    mBufferManager->getStream().synchronize();

    auto const totalTensorBytes
        = hostTensor1->getSizeInBytes() + hostTensor2->getSizeInBytes() + hostTensor3->getSizeInBytes();
    ASSERT_LE(totalTensorBytes, requiredWorkspaceSize)
        << "The calculated workspace size cannot possibly be enough to contain all the tensors.";

    // Upper bound: at most one alignment unit of padding per tensor.
    constexpr std::size_t addressAlignment = tensorrt_llm::common::kCudaMemAlign;
    constexpr std::size_t numTensors = 3;
    constexpr std::size_t maxAlignmentOverhead = numTensors * addressAlignment;
    ASSERT_GE(totalTensorBytes + maxAlignmentOverhead, requiredWorkspaceSize)
        << "We probably overestimate the amount of space the workspace requires.";

    // Distinct seeds per tensor so that swapped or aliased mirrors cannot compare equal by accident.
    populateCpuBufferWithRandomBytes(randomSeed, *hostTensor1);
    populateCpuBufferWithRandomBytes(randomSeed + 1, *hostTensor2);
    populateCpuBufferWithRandomBytes(randomSeed + 2, *hostTensor3);

    auto const [deviceTensor1, deviceTensor2, deviceTensor3]
        = workspace.mirrorInWorkspace(hostTensor1, hostTensor2, hostTensor3);

    // Round-trip the mirrored tensors back to fresh host tensors of matching shape and type.
    runtime::ITensor::SharedPtr const hostTensorCopy1 = mBufferManager->cpu(hostTensorShape1, tensorDataType1);
    runtime::ITensor::SharedPtr const hostTensorCopy2 = mBufferManager->cpu(hostTensorShape2, tensorDataType2);
    runtime::ITensor::SharedPtr const hostTensorCopy3 = mBufferManager->cpu(hostTensorShape3, tensorDataType3);
    mBufferManager->copy(*deviceTensor1, *hostTensorCopy1);
    mBufferManager->copy(*deviceTensor2, *hostTensorCopy2);
    mBufferManager->copy(*deviceTensor3, *hostTensorCopy3);
    // Make sure the copies issued on the manager's stream have completed before reading the host memory.
    mBufferManager->getStream().synchronize();

    testBufferEqual(*hostTensor1, *hostTensorCopy1);
    testBufferEqual(*hostTensor2, *hostTensorCopy2);
    testBufferEqual(*hostTensor3, *hostTensorCopy3);
}

INSTANTIATE_TEST_SUITE_P(MirrorInWorkspace, MirrorInWorkspaceTest, mirrorInWorkspaceParams);