/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/runtime/cudaEvent.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/memoryCounters.h"

#include <atomic>
#include <cuda.h>
#include <map>
#include <mutex>
#include <numeric>
#include <unistd.h>
#include <utility>

class VirtualMemoryManagerTest;

namespace tensorrt_llm::runtime
{

/**
 * CUDAVirtualMemoryChunk is a handle to a piece of CUDA memory allocation,
 * providing the ability to release and rematerialize the allocation.
 */
class CUDAVirtualMemoryChunk
{
public:
    /**
     * CUDAVirtualMemoryChunk::Creator is the interface for obtaining a CUmemGenericAllocationHandle,
     * either by creating one locally, or by importing one from a remote process.
     */
    struct Creator
    {
        Creator() = default;
        virtual ~Creator() = default;
        Creator(Creator const&) = default;
        Creator& operator=(Creator const&) = default;
        Creator(Creator&&) = default;
        Creator& operator=(Creator&&) = default;

        // Note: create() shall not leak resources when throwing exceptions.
        // release() will be called if, and only if, create() succeeded.
        // release() will be called with destructing=true when the CUDAVirtualMemoryChunk
        // is being destructed.
        virtual CUmemGenericAllocationHandle create() = 0;
        virtual void release(CUmemGenericAllocationHandle handle, bool destructing) = 0;
    };

    using CreatorPtr = std::unique_ptr<Creator>;

    /**
     * CUDAVirtualMemoryChunk::Configurator is the interface for configuring a CUmemGenericAllocationHandle:
     * - map it into a virtual address range,
     * - bind it to a multicast object,
     * - back up and restore the memory content.
     */
    struct Configurator
    {
        Configurator() = default;
        virtual ~Configurator() = default;
        Configurator(Configurator const&) = default;
        Configurator& operator=(Configurator const&) = default;
        Configurator(Configurator&&) = default;
        Configurator& operator=(Configurator&&) = default;

        // Note: setup() shall not leak resources when throwing exceptions.
        // teardown() will be called if, and only if, setup() succeeded.
        // teardown() will be called with destructing=true when the CUDAVirtualMemoryChunk
        // is being destructed.
        virtual void setup(CUmemGenericAllocationHandle handle) = 0;
        virtual void teardown(CUmemGenericAllocationHandle handle, bool destructing) = 0;
    };

    using ConfiguratorPtr = std::unique_ptr<Configurator>;
    using Configurators = std::vector<ConfiguratorPtr>;

    enum Status
    {
        INVALID,      // This is a default constructed, invalid CUDAVirtualMemoryChunk.
        RELEASED,     // The memory represented by this CUDAVirtualMemoryChunk is not allocated.
        MATERIALIZED, // The memory represented by this CUDAVirtualMemoryChunk is allocated.
        ERRORED,      // An error happened during materialize() or release().
                      // This CUDAVirtualMemoryChunk cannot be used anymore.
    };

    [[nodiscard]] Status status() const noexcept
    {
        if (mCreator == nullptr)
        {
            return INVALID;
        }

        if (mState == 0 && mHandle == 0)
        {
            return RELEASED;
        }

        if (mState == mConfigurators.size() && mHandle != 0)
        {
            return MATERIALIZED;
        }

        return ERRORED;
    }

    /**
     * Materialize this CUDAVirtualMemoryChunk.
     * Shall be called only when status() == RELEASED.
     *
     * Calls creator.create(), and then configurator.setup() for each configurator in order.
     *
     * Stops at the first thrown exception and propagates it.
     */
    void materialize();

    /**
     * Release this CUDAVirtualMemoryChunk.
     * Shall be called only when status() == MATERIALIZED, or after materialize() has thrown.
     * Will be called automatically by the destructor if necessary.
     *
     * Calls configurator.teardown(), in reverse order, for each configurator whose setup() succeeded
     * during materialize(), and then creator.release().
     *
     * Never stops early upon exception. The last thrown exception will be propagated, and the others logged.
     */
    void release()
    {
        _release(false);
    }

    CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk const&) = delete;
    CUDAVirtualMemoryChunk& operator=(CUDAVirtualMemoryChunk const&) = delete;

    CUDAVirtualMemoryChunk(CUDAVirtualMemoryChunk&& other) noexcept
    {
        mCreator = std::move(other.mCreator);
        mConfigurators = std::move(other.mConfigurators);
        mHandle = other.mHandle;
        mState = other.mState;
        new (&other) CUDAVirtualMemoryChunk; // Put other into the default constructed state.
    }

    CUDAVirtualMemoryChunk& operator=(CUDAVirtualMemoryChunk&& other)
    {
        this->~CUDAVirtualMemoryChunk(); // May throw if the current virtual memory needs release.
        new (this) CUDAVirtualMemoryChunk(std::move(other));
        return *this;
    }

    CUDAVirtualMemoryChunk() noexcept = default;

    CUDAVirtualMemoryChunk(CreatorPtr&& creator, Configurators&& configurators)
        : mCreator(std::move(creator))
        , mConfigurators(std::move(configurators))
    {
    }

    virtual ~CUDAVirtualMemoryChunk()
    {
        // Calling release() is necessary if materialize() succeeded or threw an exception.
        // If release() was already called by the user, whether it succeeded or threw an exception,
        // we shouldn't call release() again.
        if (mHandle != 0 && mState != INVALID_STATE)
        {
            _release(true);
        }
    }

    /**
     * Test whether this CUDAVirtualMemoryChunk is managing a memory block.
     */
    explicit operator bool() const noexcept
    {
        return mCreator != nullptr;
    }

private:
    void _release(bool destructing);

    constexpr static size_t INVALID_STATE = static_cast<size_t>(-1);
    size_t mState = 0;
    CUmemGenericAllocationHandle mHandle{};
    std::unique_ptr<Creator> mCreator;
    std::vector<std::unique_ptr<Configurator>> mConfigurators;
};
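
// A minimal lifecycle sketch (assuming `creator` and `configurators` are valid; concrete
// Creator/Configurator implementations follow below):
//
//   CUDAVirtualMemoryChunk chunk{std::move(creator), std::move(configurators)};
//   assert(chunk.status() == CUDAVirtualMemoryChunk::RELEASED);
//   chunk.materialize(); // create(), then setup() in order -> MATERIALIZED
//   chunk.release();     // teardown() in reverse order, then release() -> RELEASED on success
//   // If the chunk is still materialized at destruction, the destructor releases it.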

/**
 * LocalCreator creates a memory allocation locally through cuMemCreate.
 */
template <bool count = true>
struct LocalCreator : CUDAVirtualMemoryChunk::Creator
{
    LocalCreator(CUmemAllocationProp const& prop, size_t size)
        : mProp(prop)
        , mSize(size)
    {
    }

    CUmemGenericAllocationHandle create() override
    {
        CUmemGenericAllocationHandle handle{};
        TLLM_CU_CHECK(cuMemCreate(&handle, mSize, &mProp, 0));
        if constexpr (count)
        {
            MemoryCounters::getInstance().allocate(
                mProp.location.type == CU_MEM_LOCATION_TYPE_DEVICE ? MemoryType::kGPU : MemoryType::kPINNED, mSize);
        }
        return handle;
    }

    void release(CUmemGenericAllocationHandle handle, bool destructing) override
    {
        TLLM_CU_CHECK_FREE_RESOURCE(cuMemRelease(handle));
        if constexpr (count)
        {
            MemoryCounters::getInstance().deallocate(
                mProp.location.type == CU_MEM_LOCATION_TYPE_DEVICE ? MemoryType::kGPU : MemoryType::kPINNED, mSize);
        }
    }

    CUmemAllocationProp mProp{};
    size_t mSize{};
};

/**
 * UnicastConfigurator maps the allocation handle into the specified unicast address range.
 */
struct UnicastConfigurator : CUDAVirtualMemoryChunk::Configurator
{
    UnicastConfigurator(CUdeviceptr address, size_t size, CUmemAccessDesc const& desc)
        : mAddress(address)
        , mSize(size)
        , mDesc(desc)
    {
    }

    void setup(CUmemGenericAllocationHandle handle) override
    {
        TLLM_CU_CHECK(cuMemMap(mAddress, mSize, 0, handle, 0));
        TLLM_CU_CHECK(cuMemSetAccess(mAddress, mSize, &mDesc, 1));
    }

    void teardown(CUmemGenericAllocationHandle, bool) override
    {
        TLLM_CU_CHECK_FREE_RESOURCE(cuMemUnmap(mAddress, mSize));
    }

    CUdeviceptr mAddress;
    size_t mSize;
    CUmemAccessDesc mDesc;
};
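
// A minimal sketch of composing a chunk from a LocalCreator and a UnicastConfigurator,
// assuming `device` is a valid CUdevice and `size` is already aligned to the allocation
// granularity (see cuMemGetAllocationGranularity):
//
//   CUmemAllocationProp prop{};
//   prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
//   prop.location = {CU_MEM_LOCATION_TYPE_DEVICE, device};
//
//   CUdeviceptr va{};
//   TLLM_CU_CHECK(cuMemAddressReserve(&va, size, 0, 0, 0)); // the VA range outlives the chunk
//
//   CUmemAccessDesc desc{};
//   desc.location = {CU_MEM_LOCATION_TYPE_DEVICE, device};
//   desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
//
//   CUDAVirtualMemoryChunk::Configurators configurators;
//   configurators.emplace_back(std::make_unique<UnicastConfigurator>(va, size, desc));
//   CUDAVirtualMemoryChunk chunk{std::make_unique<LocalCreator<>>(prop, size), std::move(configurators)};
//   chunk.materialize(); // `va` is now backed by readable/writable device memory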

/**
 * MulticastConfigurator binds the allocation handle to the given multicast object and offset.
 */
struct MulticastConfigurator : CUDAVirtualMemoryChunk::Configurator
{
    void setup(CUmemGenericAllocationHandle handle) override
    {
        TLLM_CU_CHECK(cuMulticastBindMem(mMulticast, 0, handle, mBindOffset, mSize, 0));
    }

    void teardown(CUmemGenericAllocationHandle, bool) override
    {
        TLLM_CU_CHECK_FREE_RESOURCE(cuMulticastUnbind(mMulticast, mDevice, 0, mSize));
    }

    CUmemGenericAllocationHandle mMulticast;
    size_t mBindOffset;
    CUdevice mDevice;
    size_t mSize;
};

/**
 * MemsetConfigurator fills the memory with the given value upon rematerialization.
 * The first setup() is skipped; only rematerialized memory is refilled.
 */
struct MemsetConfigurator : CUDAVirtualMemoryChunk::Configurator
{
    MemsetConfigurator(CUdeviceptr address, size_t size, uint8_t value, CUstream stream)
        : mAddress(address)
        , mSize(size)
        , mStream(stream)
        , mValue(value)
    {
    }

    void setup(CUmemGenericAllocationHandle) override
    {
        if (mFirstTime)
        {
            mFirstTime = false;
        }
        else
        {
            TLLM_CU_CHECK(cuMemsetD8Async(mAddress, mValue, mSize, mStream));
        }
    }

    void teardown(CUmemGenericAllocationHandle, bool) noexcept override {}

    CUdeviceptr mAddress;
    size_t mSize;
    CUstream mStream{};
    uint8_t mValue;
    bool mFirstTime = true;
};

/**
 * OffloadConfigurator offloads the content of the allocation to backup storage during teardown,
 * and restores the content on the following setup.
 */
struct OffloadConfigurator : CUDAVirtualMemoryChunk::Configurator
{
    OffloadConfigurator(CUdeviceptr address, size_t size, MemoryType backType, CUstream stream, bool ondemand = false)
        : mAddress(address)
        , mSize(size)
        , mBackType(backType)
        , mStream(stream)
        , mOndemand(ondemand)
    {
    }

    void setup(CUmemGenericAllocationHandle handle) override;
    void teardown(CUmemGenericAllocationHandle handle, bool destructing) override;

    CUdeviceptr mAddress;
    size_t mSize;
    MemoryType mBackType;
    CUstream mStream;
    bool mOndemand;

    IBuffer::UniquePtr mBackedStorage;
};
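
// Configurator order matters: setup() runs front to back, teardown() back to front.
// A plausible composition (a sketch, not a prescribed recipe) places the UnicastConfigurator
// before the OffloadConfigurator, so memory is mapped before its content is restored, and its
// content is offloaded before the memory is unmapped:
//
//   CUDAVirtualMemoryChunk::Configurators configurators;
//   configurators.emplace_back(std::make_unique<UnicastConfigurator>(va, size, desc));
//   configurators.emplace_back(
//       std::make_unique<OffloadConfigurator>(va, size, MemoryType::kPINNED, stream));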

class CudaVirtualMemoryManager
{
public:
    /**
     * Add memory to be managed by this manager.
     * @param handle Unique handle provided to reference this memory in `remove`.
     * @param tag Tag for the memory, so it can be targeted in `releaseWithTag` and `materializeWithTag`.
     * @param memory The CUDAVirtualMemoryChunk object.
     *
     * The memory and internal state will remain valid if any exception is thrown.
     */
    void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk&& memory);

    /**
     * Creates and adds memory to be managed by this manager. The created memory is automatically materialized.
     * @param handle Unique handle provided to reference this memory in `remove`.
     * @param tag Tag for the memory, so it can be targeted in `releaseWithTag` and `materializeWithTag`.
     * @param creator The creator for the memory.
     * @param configurators The configurators for the memory.
     *
     * The internal state will remain valid if any exception is thrown.
     */
    void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk::CreatorPtr&& creator,
        CUDAVirtualMemoryChunk::Configurators&& configurators);

    template <typename... Configurators>
    void add(uintptr_t handle, std::string tag, CUDAVirtualMemoryChunk::CreatorPtr&& creator,
        Configurators&&... configurators)
    {
        add(handle, tag, std::move(creator), {std::forward<Configurators>(configurators)...});
    }

    /**
     * Remove the memory from the manager.
     * @param handle The handle provided to `add`.
     * @return The CUDAVirtualMemoryChunk object. If the handle is unknown, an empty CUDAVirtualMemoryChunk
     * will be returned.
     */
    CUDAVirtualMemoryChunk remove(uintptr_t handle) noexcept;

    /**
     * Call release for CUDAVirtualMemoryChunk objects with a given tag.
     * @param tag The tag to select target memories.
     * @return Number of objects selected.
     *
     * This function will always call `CUDAVirtualMemoryChunk::release` on all selected objects.
     * The last exception thrown by `CUDAVirtualMemoryChunk::release` will be rethrown, and the others logged.
     *
     * If any CUDAVirtualMemoryChunk threw an exception during `release`, it will be removed from the manager.
     * Call `retrieveBadHandles` to retrieve the handles of all CUDAVirtualMemoryChunk objects that were
     * removed due to exceptions.
     */
    size_t releaseWithTag(std::string const& tag);

    /**
     * Call materialize for CUDAVirtualMemoryChunk objects with a given tag.
     * @param tag The tag to select target memories.
     * @return Number of objects selected.
     *
     * This function will stop at the first `CUDAVirtualMemoryChunk::materialize` that throws an exception,
     * and attempt to roll back the previous successful `materialize` calls by calling `release`.
     * The exception thrown by `CUDAVirtualMemoryChunk::materialize` will be rethrown,
     * and any exception thrown by `release` will be logged.
     *
     * If any CUDAVirtualMemoryChunk threw an exception during `materialize` or `release`, it will be removed
     * from the manager. Successfully rolled-back CUDAVirtualMemoryChunk objects will not be removed.
     * Call `retrieveBadHandles` to retrieve the handles of all CUDAVirtualMemoryChunk objects that were
     * removed due to exceptions.
     */
    size_t materializeWithTag(std::string const& tag);

    /**
     * Retrieve the handles of all CUDAVirtualMemoryChunk objects that were removed due to exceptions,
     * and reset the list.
     * The returned list may not include all removed CUDAVirtualMemoryChunk handles if OOM happened.
     * This method is only for diagnostic purposes, and should not be called concurrently with other methods.
     * @return The handle list.
     */
    std::vector<uintptr_t> retrieveBadHandles() noexcept;

private:
    CUDAVirtualMemoryChunk unsafeRemove(uintptr_t handle) noexcept;
    void addBadHandle(uintptr_t handle) noexcept;

    struct Entry;
    // Unordered map invalidates iterators upon rehash, so we can only use the ordered map.
    using PointerMemoryMap = std::map<uintptr_t, Entry>;
    using TagEntryMap = std::multimap<std::string, PointerMemoryMap::iterator>;

    struct Entry
    {
        CUDAVirtualMemoryChunk mMemory;
        TagEntryMap::iterator mEntryIt;
    };

    std::mutex mMutex;
    PointerMemoryMap mMemories;
    TagEntryMap mEntries;
    std::vector<uintptr_t> mBadHandles;

    friend VirtualMemoryManagerTest;
};
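
// A minimal usage sketch for the manager (the handle value and tag are arbitrary examples;
// `va`, `prop`, `size`, and `desc` are set up as in the sketch above):
//
//   CUDAVirtualMemoryChunk::Configurators configurators;
//   configurators.emplace_back(std::make_unique<UnicastConfigurator>(va, size, desc));
//   auto& manager = getVirtualMemoryManager();
//   manager.add(reinterpret_cast<uintptr_t>(va), "kv-cache",
//       std::make_unique<LocalCreator<>>(prop, size), std::move(configurators)); // adds and materializes
//
//   manager.releaseWithTag("kv-cache");     // free the physical memory, keep the entry
//   manager.materializeWithTag("kv-cache"); // re-create and re-map the physical memory
//   auto chunk = manager.remove(reinterpret_cast<uintptr_t>(va)); // take ownership back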

class CudaVirtualMemoryAllocator
{
    using CudaStreamPtr = std::shared_ptr<CudaStream>;
    using Pointer = void*;

public:
    enum RestoreMode
    {
        NONE,   // The memory is not backed. Upon rematerialization, the memory has uninitialized content.
        MEMSET, // The memory is memset to zero upon rematerialization.
        CPU,    // The memory is backed by normal CPU memory. The content is restored upon rematerialization.
        PINNED  // The memory is backed by pinned CPU memory. The content is restored upon rematerialization.
    };

    class Configuration
    {
        CudaVirtualMemoryManager& mManager;
        std::string mTag;
        CudaStreamPtr mBackStream;
        std::atomic<std::size_t> mAlignment;
        RestoreMode mMode;
        bool mBackground{};

        friend class CudaVirtualMemoryAllocator;
        friend void setVirtualMemoryAllocator(
            std::string const& tag, RestoreMode mode, std::shared_ptr<CudaStream> backStream);

    public:
        /**
         * CudaVirtualMemoryAllocator::Configuration
         * @param manager The manager used to track and manage virtual memories.
         * @param tag The tag for allocated memories.
         * @param mode The backing storage mode.
         * @param backStream The CUDA stream used for restoring memory content.
         * Note: virtual address allocation is not asynchronous; the stream is not used during allocation.
         */
        Configuration(CudaVirtualMemoryManager& manager, std::string tag, RestoreMode mode, CudaStreamPtr backStream)
            : mManager(manager)
            , mTag(std::move(tag))
            , mBackStream(std::move(backStream))
            , mAlignment(0)
            , mMode(mode)
        {
        }

        [[nodiscard]] std::size_t aligned(std::size_t n, int device = 0)
        {
            // Lazily load the alignment, since the CUDA driver may not yet be initialized when the
            // Configuration is constructed.
            // We have one process per GPU, so caching the value is fine.
            constexpr std::size_t loading = std::numeric_limits<std::size_t>::max();
            std::size_t alignment = 0;
            if (mAlignment.compare_exchange_strong(alignment, loading, std::memory_order_relaxed))
            {
                std::size_t gpuAlignment = 1;
                CUmemAllocationProp const prop{CU_MEM_ALLOCATION_TYPE_PINNED, CU_MEM_HANDLE_TYPE_NONE,
                    {
                        CU_MEM_LOCATION_TYPE_DEVICE,
                        device,
                    }};
                TLLM_CU_CHECK(
                    cuMemGetAllocationGranularity(&gpuAlignment, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
                alignment = std::lcm(getpagesize(), gpuAlignment);
                mAlignment.store(alignment, std::memory_order_relaxed);
            }
            else
            {
                // Spin-wait until the initializing thread publishes the alignment.
                while (alignment == loading)
                {
#if defined(__x86_64__)
                    asm volatile("pause");
#elif defined(__aarch64__)
                    asm volatile("yield");
#endif
                    alignment = mAlignment.load(std::memory_order_relaxed);
                }
            }
            return (n + alignment - 1) / alignment * alignment;
        }
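
        // For example (a sketch with an assumed alignment): if the lcm of the page size and the
        // GPU granularity is 2 MiB, aligned(3 << 20) rounds 3 MiB up to
        // (3 MiB + 2 MiB - 1) / 2 MiB * 2 MiB = 4 MiB.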

        // Background configuration, used to indicate that no virtual memory allocator was explicitly
        // configured by the user.
        static Configuration backgroundConfiguration;

    private:
        Configuration(CudaVirtualMemoryManager& manager, std::string tag, RestoreMode mode, CudaStreamPtr backStream,
            bool background)
            : Configuration(manager, std::move(tag), mode, std::move(backStream))
        {
            mBackground = background;
        }
    };

    explicit CudaVirtualMemoryAllocator(std::shared_ptr<Configuration> config)
        : mConfig(std::move(config))
    {
    }

    // Tells whether an allocator was explicitly configured; returns false for the background allocator.
    explicit operator bool() const noexcept
    {
        return !mConfig->mBackground;
    }

    void allocate(Pointer* ptr, std::size_t n, int device) const;
    void deallocate(Pointer ptr, std::size_t n) const;

private:
    std::shared_ptr<Configuration> mConfig;
};

} // namespace tensorrt_llm::runtime

namespace tensorrt_llm::runtime
{
CudaVirtualMemoryManager& getVirtualMemoryManager();
CudaVirtualMemoryAllocator getVirtualMemoryAllocator();
void setVirtualMemoryAllocator(
    std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr<CudaStream> backStream);
void clearVirtualMemoryAllocator();
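
// A minimal sketch of the process-wide configuration flow (the tag and `stream` are example
// values; PINNED backs released memory with pinned host memory, per RestoreMode above):
//
//   setVirtualMemoryAllocator("kv-cache", CudaVirtualMemoryAllocator::PINNED, stream);
//   auto allocator = getVirtualMemoryAllocator(); // explicitly configured: operator bool() is true
//   // ... memory tagged "kv-cache" can now be released and rematerialized by tag ...
//   clearVirtualMemoryAllocator(); // revert to the background allocator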
} // namespace tensorrt_llm::runtime