/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/runtime/cudaEvent.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/memoryCounters.h"

#include <cuda.h>

#include <atomic>
#include <cstdint>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <new>
#include <numeric>
#include <string>
#include <vector>

#include <unistd.h>

class VirtualMemoryManagerTest;

namespace tensorrt_llm::runtime
{

/**
 * CUDAVirtualMemoryChunk is a handle to a piece of CUDA memory allocation,
 * providing the ability to release and rematerialize the allocation.
 */
class CUDAVirtualMemoryChunk
{
public:
    /**
     * CUDAVirtualMemoryChunk::Creator is the interface for obtaining a CUmemGenericAllocationHandle,
     * either by creating one locally, or by importing one from a remote process.
     */
    struct Creator
    {
        Creator() = default;
        virtual ~Creator() = default;
        Creator(Creator const&) = default;
        Creator& operator=(Creator const&) = default;
        Creator(Creator&&) = default;
        Creator& operator=(Creator&&) = default;

        // Note: create() shall not leak resources when throwing exceptions.
        // release() will be called only if create() succeeded, and will always be called in that case.
        // release() will be called with destructing=true when the CUDAVirtualMemoryChunk
        // is being destructed.
        virtual CUmemGenericAllocationHandle create() = 0;
        virtual void release(CUmemGenericAllocationHandle handle, bool destructing) = 0;
    };

    using CreatorPtr = std::unique_ptr<Creator>;

    /**
     * CUDAVirtualMemoryChunk::Configurator is the interface for configuring a CUmemGenericAllocationHandle:
     * - Map it into a virtual address range
     * - Bind it to a multicast object
     * - Back up and restore the memory content
     */
    struct Configurator
    {
        Configurator() = default;
        virtual ~Configurator() = default;
        Configurator(Configurator const&) = default;
        Configurator& operator=(Configurator const&) = default;
        Configurator(Configurator&&) = default;
        Configurator& operator=(Configurator&&) = default;

        // Note: setup() shall not leak resources when throwing exceptions.
        // teardown() will be called only if setup() succeeded, and will always be called in that case.
        // teardown() will be called with destructing=true when the CUDAVirtualMemoryChunk
        // is being destructed.
        virtual void setup(CUmemGenericAllocationHandle handle) = 0;
        virtual void teardown(CUmemGenericAllocationHandle handle, bool destructing) = 0;
    };

    using ConfiguratorPtr = std::unique_ptr<Configurator>;
    using Configurators = std::vector<ConfiguratorPtr>;
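
    // A minimal sketch (illustrative only, not part of the API) of a custom Creator.
    // The name SketchCreator and the fixed kSize are hypothetical; LocalCreator below
    // is the in-tree implementation of this pattern.
    //
    //     struct SketchCreator : CUDAVirtualMemoryChunk::Creator
    //     {
    //         CUmemGenericAllocationHandle create() override
    //         {
    //             CUmemAllocationProp prop{};
    //             prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    //             prop.location = {CU_MEM_LOCATION_TYPE_DEVICE, /*device=*/0};
    //             CUmemGenericAllocationHandle handle{};
    //             TLLM_CU_CHECK(cuMemCreate(&handle, kSize, &prop, 0)); // must not leak on throw
    //             return handle;
    //         }
    //
    //         void release(CUmemGenericAllocationHandle handle, bool /*destructing*/) override
    //         {
    //             TLLM_CU_CHECK_FREE_RESOURCE(cuMemRelease(handle));
    //         }
    //
    //         static constexpr size_t kSize = size_t{2} << 20; // 2 MiB, assumed granularity-aligned
    //     };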
    enum Status
    {
        INVALID,      // This is a default constructed invalid CUDAVirtualMemoryChunk.
        RELEASED,     // The memory represented by this CUDAVirtualMemoryChunk is not allocated.
        MATERIALIZED, // The memory represented by this CUDAVirtualMemoryChunk is allocated.
        ERRORED,      // An error happened during materialize() or release().
                      // This CUDAVirtualMemoryChunk cannot be used anymore.
    };

    [[nodiscard]] Status status() const noexcept
    {
        if (mCreator == nullptr)
        {
            return INVALID;
        }
        if (mState == 0 && mHandle == 0)
        {
            return RELEASED;
        }
        if (mState == mConfigurators.size() && mHandle != 0)
        {
            return MATERIALIZED;
        }
        return ERRORED;
    }

    /**
     * Materialize this CUDAVirtualMemoryChunk.
     * Shall be called only when status() == RELEASED.
     *
     * Calls creator.create(), and then configurator.setup() for each configurator in order.
     *
     * Stops at the first thrown exception and propagates it.
     */
    void materialize();

    /**
     * Release this CUDAVirtualMemoryChunk.
     * Shall be called only when status() == MATERIALIZED, or after materialize() throws.
     * Will be called automatically by the destructor if necessary.
     *
     * Calls configurator.teardown() for each configurator whose setup() succeeded in materialize(),
     * in reverse order, and then creator.release().
     *
     * Never stops early upon exception. The last thrown exception will be propagated, and others logged.
     */
    void release()
    {
        _release(false);
    }

    CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk const&) = delete;
    CUDAVirtualMemoryChunk& operator=(CUDAVirtualMemoryChunk const&) = delete;

    CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk&& other) noexcept
    {
        mCreator = std::move(other.mCreator);
        mConfigurators = std::move(other.mConfigurators);
        mHandle = other.mHandle;
        mState = other.mState;
        new (&other) CUDAVirtualMemoryChunk; // Put other into default constructed state
    }

    CUDAVirtualMemoryChunk& operator=(CUDAVirtualMemoryChunk&& other)
    {
        this->~CUDAVirtualMemoryChunk(); // May throw if the current virtual memory needs release
        new (this) CUDAVirtualMemoryChunk(std::move(other));
        return *this;
    }

    CUDAVirtualMemoryChunk() noexcept = default;

    CUDAVirtualMemoryChunk(CreatorPtr&& creator, Configurators&& configurators)
        : mCreator(std::move(creator))
        , mConfigurators(std::move(configurators))
    {
    }

    virtual ~CUDAVirtualMemoryChunk()
    {
        // Calling release() is necessary if materialize() succeeded or threw an exception.
        // If release() was already called by the user, whether it succeeded or threw an exception,
        // we shouldn't call release() again.
        if (mHandle != 0 && mState != INVALID_STATE)
        {
            _release(true);
        }
    }

    /**
     * Test if this CUDAVirtualMemoryChunk is managing a memory block.
     */
    explicit operator bool() const noexcept
    {
        return mCreator != nullptr;
    }

private:
    void _release(bool destructing);

    constexpr static size_t INVALID_STATE = static_cast<size_t>(-1);

    size_t mState = 0;
    CUmemGenericAllocationHandle mHandle{};
    std::unique_ptr<Creator> mCreator;
    std::vector<std::unique_ptr<Configurator>> mConfigurators;
};

/**
 * LocalCreator creates the memory allocation locally through cuMemCreate.
 */
template <bool count>
struct LocalCreator : CUDAVirtualMemoryChunk::Creator
{
    LocalCreator(CUmemAllocationProp const& prop, size_t size)
        : mProp(prop)
        , mSize(size)
    {
    }

    CUmemGenericAllocationHandle create() override
    {
        CUmemGenericAllocationHandle handle{};
        TLLM_CU_CHECK(cuMemCreate(&handle, mSize, &mProp, 0));
        if constexpr (count)
        {
            MemoryCounters::getInstance().allocate(
                mProp.location.type == CU_MEM_LOCATION_TYPE_DEVICE ? MemoryType::kGPU : MemoryType::kPINNED, mSize);
        }
        return handle;
    }

    void release(CUmemGenericAllocationHandle handle, bool destructing) override
    {
        TLLM_CU_CHECK_FREE_RESOURCE(cuMemRelease(handle));
        if constexpr (count)
        {
            MemoryCounters::getInstance().deallocate(
                mProp.location.type == CU_MEM_LOCATION_TYPE_DEVICE ? MemoryType::kGPU : MemoryType::kPINNED, mSize);
        }
    }

    CUmemAllocationProp mProp{};
    size_t mSize{};
};
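
// A minimal sketch (illustrative only): constructing a LocalCreator. cuMemCreate
// requires the size to be a multiple of the allocation granularity, so the
// requested size is rounded up first. `device` and `requested` are hypothetical.
//
//     CUmemAllocationProp prop{};
//     prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
//     prop.location = {CU_MEM_LOCATION_TYPE_DEVICE, device};
//     size_t granularity = 0;
//     TLLM_CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
//     size_t const size = (requested + granularity - 1) / granularity * granularity;
//     auto creator = std::make_unique<LocalCreator<true>>(prop, size); // <true>: update MemoryCounters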

/**
 * UnicastConfigurator maps the allocation handle into the specified unicast address range.
 */
struct UnicastConfigurator : CUDAVirtualMemoryChunk::Configurator
{
    UnicastConfigurator(CUdeviceptr address, size_t size, CUmemAccessDesc const& desc)
        : mAddress(address)
        , mSize(size)
        , mDesc(desc)
    {
    }

    void setup(CUmemGenericAllocationHandle handle) override
    {
        TLLM_CU_CHECK(cuMemMap(mAddress, mSize, 0, handle, 0));
        TLLM_CU_CHECK(cuMemSetAccess(mAddress, mSize, &mDesc, 1));
    }

    void teardown(CUmemGenericAllocationHandle, bool) override
    {
        TLLM_CU_CHECK_FREE_RESOURCE(cuMemUnmap(mAddress, mSize));
    }

    CUdeviceptr mAddress;
    size_t mSize;
    CUmemAccessDesc mDesc;
};

/**
 * MulticastConfigurator binds the allocation handle to the given multicast object at the given offset.
 */
struct MulticastConfigurator : CUDAVirtualMemoryChunk::Configurator
{
    void setup(CUmemGenericAllocationHandle handle) override
    {
        TLLM_CU_CHECK(cuMulticastBindMem(mMulticast, 0, handle, mBindOffset, mSize, 0));
    }

    void teardown(CUmemGenericAllocationHandle, bool) override
    {
        TLLM_CU_CHECK_FREE_RESOURCE(cuMulticastUnbind(mMulticast, mDevice, 0, mSize));
    }

    CUmemGenericAllocationHandle mMulticast;
    size_t mBindOffset;
    CUdevice mDevice;
    size_t mSize;
};

/**
 * MemsetConfigurator fills the memory with the given value.
 */
struct MemsetConfigurator : CUDAVirtualMemoryChunk::Configurator
{
    MemsetConfigurator(CUdeviceptr address, size_t size, uint8_t value, CUstream stream)
        : mAddress(address)
        , mSize(size)
        , mStream(stream)
        , mValue(value)
    {
    }

    void setup(CUmemGenericAllocationHandle) override
    {
        // Skip the memset on the initial materialization; only rematerialized memory
        // needs its content reset.
        if (mFirstTime)
        {
            mFirstTime = false;
        }
        else
        {
            TLLM_CU_CHECK(cuMemsetD8Async(mAddress, mValue, mSize, mStream));
        }
    }

    void teardown(CUmemGenericAllocationHandle, bool) noexcept override {}

    CUdeviceptr mAddress;
    size_t mSize;
    CUstream mStream{};
    uint8_t mValue;
    bool mFirstTime = true;
};

/**
 * OffloadConfigurator offloads the content of the allocation to backing storage on teardown,
 * and restores the content on the following setup.
 */
struct OffloadConfigurator : CUDAVirtualMemoryChunk::Configurator
{
    OffloadConfigurator(CUdeviceptr address, size_t size, MemoryType backType, CUstream stream, bool ondemand = false)
        : mAddress(address)
        , mSize(size)
        , mBackType(backType)
        , mStream(stream)
        , mOndemand(ondemand)
    {
    }

    void setup(CUmemGenericAllocationHandle handle) override;
    void teardown(CUmemGenericAllocationHandle handle, bool destructing) override;

    CUdeviceptr mAddress;
    size_t mSize;
    MemoryType mBackType;
    CUstream mStream;
    bool mOndemand;
    IBuffer::UniquePtr mBackedStorage;
};
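
// A minimal sketch (illustrative only) of assembling a chunk from the pieces above:
// a device allocation mapped at a reserved virtual address, with its content
// offloaded to pinned host memory while released. `prop`, `size`, `device`, and
// `stream` are hypothetical, and `size` is assumed granularity-aligned.
//
//     CUdeviceptr ptr{};
//     TLLM_CU_CHECK(cuMemAddressReserve(&ptr, size, 0, 0, 0));
//
//     CUmemAccessDesc access{};
//     access.location = {CU_MEM_LOCATION_TYPE_DEVICE, device};
//     access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
//
//     CUDAVirtualMemoryChunk::Configurators configurators;
//     configurators.emplace_back(std::make_unique<UnicastConfigurator>(ptr, size, access));
//     configurators.emplace_back(std::make_unique<OffloadConfigurator>(ptr, size, MemoryType::kPINNED, stream));
//
//     CUDAVirtualMemoryChunk chunk{std::make_unique<LocalCreator<true>>(prop, size), std::move(configurators)};
//     chunk.materialize(); // create the physical memory and map it at ptr
//     chunk.release();     // offload the content, unmap, and release the physical memory
//     chunk.materialize(); // recreate, remap, and restore the content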

class CudaVirtualMemoryManager
{
public:
    /**
     * Add memory to be managed by this manager.
     * @param handle Unique handle used to reference this memory in `remove`.
     * @param tag Tag for the memory, so it can be targeted in `releaseWithTag` and `materializeWithTag`.
     * @param memory The CUDAVirtualMemoryChunk object.
     *
     * The memory and the internal state will remain valid if any exception is thrown.
     */
    void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk&& memory);

    /**
     * Creates and adds memory to be managed by this manager. The created memory is automatically materialized.
     * @param handle Unique handle used to reference this memory in `remove`.
     * @param tag Tag for the memory, so it can be targeted in `releaseWithTag` and `materializeWithTag`.
     * @param creator The creator for the memory.
     * @param configurators The configurators for the memory.
     *
     * The internal state will remain valid if any exception is thrown.
     */
    void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk::CreatorPtr&& creator,
        CUDAVirtualMemoryChunk::Configurators&& configurators);

    template <typename... Configurators>
    void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk::CreatorPtr&& creator,
        Configurators&&... configurators)
    {
        add(handle, tag, std::move(creator), {std::forward<Configurators>(configurators)...});
    }

    /**
     * Remove the memory from the manager.
     * @param handle The handle provided to `add`.
     * @return The CUDAVirtualMemoryChunk object. If the handle is unknown, an empty CUDAVirtualMemoryChunk will be
     * returned.
     */
    CUDAVirtualMemoryChunk remove(uintptr_t handle) noexcept;

    /**
     * Call release for the CUDAVirtualMemoryChunk objects with a given tag.
     * @param tag The tag selecting the target memories.
     * @return The number of objects selected.
     *
     * This function will always call `CUDAVirtualMemoryChunk::release` on all selected objects.
     * The last exception thrown by `CUDAVirtualMemoryChunk::release` will be rethrown, and others will be logged.
     *
     * If any CUDAVirtualMemoryChunk threw an exception during `release`, it will be removed from the manager.
     * Call `retrieveBadHandles` to retrieve the handles of all CUDAVirtualMemoryChunk objects that were removed due
     * to exceptions.
     */
    size_t releaseWithTag(std::string const& tag);

    /**
     * Call materialize for the CUDAVirtualMemoryChunk objects with a given tag.
     * @param tag The tag selecting the target memories.
     * @return The number of objects selected.
     *
     * This function will stop at the first `CUDAVirtualMemoryChunk::materialize` that throws an exception,
     * and attempt to roll back the previously successful `materialize` calls by calling `release`.
     * The exception thrown by `CUDAVirtualMemoryChunk::materialize` will be rethrown,
     * and any exception thrown by `release` will be logged.
     *
     * If any CUDAVirtualMemoryChunk threw an exception during `materialize` or `release`, it will be removed from
     * the manager. Successfully rolled-back CUDAVirtualMemoryChunk objects will not be removed.
     * Call `retrieveBadHandles` to retrieve the handles of all CUDAVirtualMemoryChunk objects that were removed due
     * to exceptions.
     */
    size_t materializeWithTag(std::string const& tag);

    /**
     * Retrieve the handles of all CUDAVirtualMemoryChunk objects that were removed due to exceptions, and reset the
     * list. The returned list may not include all removed CUDAVirtualMemoryChunk handles if OOM happened.
     * This method is only for diagnostic purposes, and should not be called concurrently with other methods.
     * @return The handle list.
     */
    std::vector<uintptr_t> retrieveBadHandles() noexcept;

private:
    CUDAVirtualMemoryChunk unsafeRemove(uintptr_t handle) noexcept;
    void addBadHandle(uintptr_t handle) noexcept;

    struct Entry;
    // Unordered maps invalidate iterators upon rehash, so we can only use the ordered map.
    using PointerMemoryMap = std::map<uintptr_t, Entry>;
    using TagEntryMap = std::multimap<std::string, PointerMemoryMap::iterator>;

    struct Entry
    {
        CUDAVirtualMemoryChunk mMemory;
        TagEntryMap::iterator mEntryIt;
    };

    std::mutex mMutex;
    PointerMemoryMap mMemories;
    TagEntryMap mEntries;
    std::vector<uintptr_t> mBadHandles;

    friend VirtualMemoryManagerTest;
};
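
// A minimal sketch (illustrative only) of manager usage: chunks are registered
// under a tag, then released and rematerialized in bulk. `kTag` and the handle
// value are hypothetical; handles only need to be unique within the manager.
//
//     auto& manager = getVirtualMemoryManager();
//     manager.add(reinterpret_cast<uintptr_t>(ptr), kTag, std::move(creator), std::move(configurators));
//     ...
//     manager.releaseWithTag(kTag);     // free the physical memory, keeping virtual addresses
//     manager.materializeWithTag(kTag); // bring the memory (and its content, if backed) back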

class CudaVirtualMemoryAllocator
{
    using CudaStreamPtr = std::shared_ptr<CudaStream>;
    using Pointer = void*;

public:
    enum RestoreMode
    {
        NONE,   // The memory is not backed. Upon rematerialization, the memory has uninitialized content.
        MEMSET, // The memory is memset to zero upon rematerialization.
        CPU,    // The memory is backed by normal CPU memory. The content is restored upon rematerialization.
        PINNED, // The memory is backed by pinned CPU memory. The content is restored upon rematerialization.
    };

    class Configuration
    {
        CudaVirtualMemoryManager& mManager;
        std::string mTag;
        CudaStreamPtr mBackStream;
        std::atomic<std::size_t> mAlignment;
        RestoreMode mMode;
        bool mBackground{};

        friend class CudaVirtualMemoryAllocator;
        friend void setVirtualMemoryAllocator(
            std::string const& tag, RestoreMode mode, std::shared_ptr<CudaStream> backStream);

    public:
        /**
         * CudaVirtualMemoryAllocator::Configuration
         * @param manager The manager used to track and manage virtual memories.
         * @param tag The tag for allocated memories.
         * @param mode The backing storage mode.
         * @param backStream The CUDA stream used for restoring memory content.
         * Note: Virtual address allocation is not async. The stream is not used during allocation.
         */
        Configuration(CudaVirtualMemoryManager& manager, std::string tag, RestoreMode mode, CudaStreamPtr backStream)
            : mManager(manager)
            , mTag(std::move(tag))
            , mBackStream(std::move(backStream))
            , mAlignment(0)
            , mMode(mode)
        {
        }

        [[nodiscard]] std::size_t aligned(std::size_t n, int device = 0)
        {
            // Load the alignment lazily, since the CUDA driver may not yet be initialized when the
            // Configuration is constructed.
            // We have one process for each GPU, so caching the value is fine.
            constexpr std::size_t loading = std::numeric_limits<std::size_t>::max();
            std::size_t alignment = 0;
            if (mAlignment.compare_exchange_strong(alignment, loading, std::memory_order_relaxed))
            {
                std::size_t gpuAlignment = 1;
                CUmemAllocationProp const prop{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE,
                    {
                        CU_MEM_LOCATION_TYPE_DEVICE,
                        device,
                    }};
                TLLM_CU_CHECK(
                    cuMemGetAllocationGranularity(&gpuAlignment, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
                alignment = std::lcm(getpagesize(), gpuAlignment);
                mAlignment.store(alignment, std::memory_order_relaxed);
            }
            else
            {
                // Spin wait until the initializing thread stores the computed alignment.
                while (alignment == loading)
                {
#if defined(__x86_64__)
                    asm volatile("pause");
#elif defined(__aarch64__)
                    asm volatile("yield");
#endif
                    alignment = mAlignment.load(std::memory_order_relaxed);
                }
            }
            return (n + alignment - 1) / alignment * alignment;
        }

        // Background configuration, used to indicate that no virtual memory allocator is explicitly configured by
        // the user.
        static Configuration backgroundConfiguration;

    private:
        Configuration(CudaVirtualMemoryManager& manager, std::string tag, RestoreMode mode, CudaStreamPtr backStream,
            bool background)
            : Configuration(manager, std::move(tag), mode, std::move(backStream))
        {
            mBackground = background;
        }
    };

    explicit CudaVirtualMemoryAllocator(std::shared_ptr<Configuration> config)
        : mConfig(std::move(config))
    {
    }

    // Tells if this is not the background allocator.
    explicit operator bool() const noexcept
    {
        return !mConfig->mBackground;
    }

    void allocate(Pointer* ptr, std::size_t n, int device) const;
    void deallocate(Pointer ptr, std::size_t n) const;

private:
    std::shared_ptr<Configuration> mConfig;
};

} // namespace tensorrt_llm::runtime

namespace tensorrt_llm::runtime
{

CudaVirtualMemoryManager& getVirtualMemoryManager();

CudaVirtualMemoryAllocator getVirtualMemoryAllocator();

void setVirtualMemoryAllocator(
    std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr<CudaStream> backStream);

void clearVirtualMemoryAllocator();

} // namespace tensorrt_llm::runtime
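
// A minimal sketch (illustrative only) of the free-function entry points: route
// subsequent allocations through a virtual-memory allocator with pinned-host
// backup, then restore the default behavior. The tag and `stream` are hypothetical.
//
//     setVirtualMemoryAllocator("kv-cache", CudaVirtualMemoryAllocator::PINNED, stream);
//     // ... allocations made here are tagged "kv-cache" and can be released
//     // and restored through getVirtualMemoryManager() ...
//     clearVirtualMemoryAllocator();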