/*
 * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/runtime/bufferManager.h"

#include <limits>
#include <memory>

using namespace tensorrt_llm::runtime;
namespace tc = tensorrt_llm::common;

class BufferManagerTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init)
{
protected:
    void SetUp() override
    {
        mDeviceCount = tc::getDeviceCount();

        if (mDeviceCount > 0)
        {
            mStream = std::make_unique<CudaStream>();
        }
        else
        {
            GTEST_SKIP();
        }
    }

    void TearDown() override {}

    int mDeviceCount;
    BufferManager::CudaStreamPtr mStream;
};

namespace
{

template <typename T>
T convertType(std::size_t val)
{
    return static_cast<T>(val);
}

template <>
half convertType(std::size_t val)
{
    return __float2half_rn(static_cast<float>(val));
}

template <typename T>
void testRoundTrip(BufferManager& manager)
{
    auto constexpr size = 128;
    std::vector<T> inputCpu(size);
    for (std::size_t i = 0; i < size; ++i)
    {
        inputCpu[i] = convertType<T>(i);
    }
    auto inputGpu = manager.copyFrom(inputCpu, MemoryType::kGPU);
    auto outputCpu = manager.copyFrom(*inputGpu, MemoryType::kPINNED);
    EXPECT_EQ(inputCpu.size(), outputCpu->getSize());
    manager.getStream().synchronize();
    auto outputCpuTyped = bufferCast<T>(*outputCpu);
    for (size_t i = 0; i < inputCpu.size(); ++i)
    {
        EXPECT_EQ(inputCpu[i], outputCpuTyped[i]);
    }

    manager.setZero(*inputGpu);
    manager.copy(*inputGpu, *outputCpu);
    manager.getStream().synchronize();
    for (size_t i = 0; i < inputCpu.size(); ++i)
    {
        EXPECT_EQ(0, static_cast<std::int32_t>(outputCpuTyped[i]));
    }
}

} // namespace

TEST_F(BufferManagerTest, CreateCopyRoundTrip)
{
    BufferManager manager(mStream);
    testRoundTrip<float>(manager);
    testRoundTrip<half>(manager);
    testRoundTrip<std::int8_t>(manager);
    testRoundTrip<std::uint8_t>(manager);
    testRoundTrip<std::int32_t>(manager);
}

TEST_F(BufferManagerTest, Pointers)
{
    // This could be any C++ type supported by TensorRT.
    using cppBaseType = TokenIdType;
    // We want to store pointers to the C++ base type in the buffer.
    using cppPointerType = cppBaseType*;
    // This represents the TensorRT type for the pointer.
    auto constexpr trtPointerType = TRTDataType<cppPointerType>::value;
    static_assert(std::is_same_v<decltype(trtPointerType), BufferDataType const>);
    static_assert(trtPointerType.isPointer());
    static_assert(trtPointerType.getDataType() == TRTDataType<cppBaseType>::value);
    static_assert(static_cast<nvinfer1::DataType>(trtPointerType) == BufferDataType::kTrtPointerType);
    static_assert(trtPointerType == BufferDataType::kTrtPointerType); // uses implicit type conversion
    // The C++ type corresponding to the TensorRT type for storing pointers (int64_t)
    using cppStorageType = DataTypeTraits<BufferDataType::kTrtPointerType>::type;
    static_assert(sizeof(cppStorageType) == sizeof(cppPointerType));

    BufferManager manager(mStream);
    auto constexpr batchSize = 16;

    // This buffer is on the CPU for convenient testing. In real code, this would be on the GPU.
    auto pointers = manager.allocate(MemoryType::kCPU, batchSize, trtPointerType);
    // We cast to the correct C++ pointer type, checking that the underlying storage type is int64_t.
    auto pointerBuf = bufferCast<cppPointerType>(*pointers);

    // Create the GPU tensors.
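    // Each batch slot gets its own [beamWidth, maxSeqLen] tensor on the GPU; storing every
    // tensor's raw device pointer in pointerBuf yields an array-of-pointers view of the batch,
    // which is the layout a batched kernel would typically consume.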
    std::vector<ITensor::SharedPtr> tensors(batchSize);
    auto constexpr beamWidth = 4;
    auto constexpr maxSeqLen = 10;
    auto const shape = ITensor::makeShape({beamWidth, maxSeqLen});
    for (auto i = 0u; i < batchSize; ++i)
    {
        tensors[i] = manager.allocate(MemoryType::kGPU, shape, TRTDataType<cppBaseType>::value);
        pointerBuf[i] = bufferCast<cppBaseType>(*tensors[i]);
    }

    // Test that all pointers are valid.
    for (auto i = 0u; i < batchSize; ++i)
    {
        EXPECT_EQ(pointerBuf[i], tensors[i]->data());
    }
}

TEST_F(BufferManagerTest, MemPoolAttributes)
{
    BufferManager manager(mStream); // sets attributes of the default memory pool
    auto const device = mStream->getDevice();
    ::cudaMemPool_t memPool;
    TLLM_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&memPool, device));
    std::uint64_t threshold{0};
    TLLM_CUDA_CHECK(cudaMemPoolGetAttribute(memPool, cudaMemPoolAttrReleaseThreshold, &threshold));
    EXPECT_EQ(threshold, std::numeric_limits<std::uint64_t>::max());

    manager.memoryPoolTrimTo(0);
    auto const reserved = manager.memoryPoolReserved();
    auto const used = manager.memoryPoolUsed();
    auto const free = manager.memoryPoolFree();
    EXPECT_EQ(free, reserved - used);

    auto constexpr kBytesToReserve = 1 << 20;
    {
        auto const mem = manager.allocate(MemoryType::kGPU, kBytesToReserve);
        EXPECT_EQ(mem->getSize(), kBytesToReserve);
        EXPECT_GE(manager.memoryPoolReserved(), reserved + kBytesToReserve);
        EXPECT_GE(manager.memoryPoolUsed(), used + kBytesToReserve);
    }
    EXPECT_GE(manager.memoryPoolFree(), free + kBytesToReserve);
    manager.memoryPoolTrimTo(0);
    EXPECT_LE(manager.memoryPoolReserved(), reserved);
    EXPECT_LE(manager.memoryPoolFree(), free);
}
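
// Illustrative sketch: a hypothetical test (its name and constants are assumptions, not part
// of the original suite) that reuses only calls exercised above (copyFrom, copy, setZero,
// bufferCast, synchronize) to run the same round trip entirely in host-visible pinned memory.
TEST_F(BufferManagerTest, PinnedRoundTripSketch)
{
    BufferManager manager(mStream);
    auto constexpr size = 64; // arbitrary element count for this sketch
    std::vector<std::int32_t> inputCpu(size);
    for (std::size_t i = 0; i < size; ++i)
    {
        inputCpu[i] = convertType<std::int32_t>(i);
    }
    // Stage the host vector in a pinned buffer, then copy it back out through a second buffer.
    auto pinned = manager.copyFrom(inputCpu, MemoryType::kPINNED);
    auto outputCpu = manager.copyFrom(*pinned, MemoryType::kPINNED);
    manager.getStream().synchronize(); // conservative: copies are issued on the manager's stream
    auto outputTyped = bufferCast<std::int32_t>(*outputCpu);
    for (std::size_t i = 0; i < size; ++i)
    {
        EXPECT_EQ(inputCpu[i], outputTyped[i]);
    }

    // Zero the staging buffer and confirm the zeros propagate through another copy.
    manager.setZero(*pinned);
    manager.copy(*pinned, *outputCpu);
    manager.getStream().synchronize();
    for (std::size_t i = 0; i < size; ++i)
    {
        EXPECT_EQ(0, outputTyped[i]);
    }
}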