TensorRT-LLM/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp

/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string.h>
#include <cstddef>
#include <cstdlib>
#include <string>
#include <cuda_runtime_api.h>
#include "gdrwrap.h"
#include "hostAccessibleDeviceAllocator.h"
#include "topologyDetector.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"

namespace tensorrt_llm::runtime
{

bool HostAccessibleDeviceAllocator::mAllowManagedFallback = false;
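
// Returns true when host-accessible device memory can be provided on this system:
// either the GPU memory is exposed as a NUMA node, or GDRCopy can be initialized.
// Otherwise, support depends on whether the managed-memory fallback is allowed.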
bool HostAccessibleDeviceAllocator::isSupported()
{
if (TopologyDetector::getInstance().getCurrentGpuMemoryNumaId() >= 0)
{
// We are on a system where the GPU memory is also exposed as a NUMA node.
return true;
}
if (!tensorrt_llm::runtime::gdrcopy::isInitialized() && !tensorrt_llm::runtime::gdrcopy::initialize())
{
// The system does not support GDRCopy.
return mAllowManagedFallback;
}
return true;
}
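
// One-time initialization: reads the TLLM_HOST_ACCESSIBLE_ALLOW_MANAGED_FALLBACK
// environment variable, records the current device, and opens a GDRCopy handle
// when the GPU memory is not exposed as a NUMA node.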
void HostAccessibleDeviceAllocator::init()
{
TLLM_CHECK(mIsInited == false);
char const* allowManagedFallbackEnv = getenv("TLLM_HOST_ACCESSIBLE_ALLOW_MANAGED_FALLBACK");
if (allowManagedFallbackEnv != nullptr && std::string(allowManagedFallbackEnv) == "1")
{
mAllowManagedFallback = true;
}
TLLM_CUDA_CHECK(cudaGetDevice(&mDevId));
mGpuMemNumaId = TopologyDetector::getInstance().getCurrentGpuMemoryNumaId();
if (mGpuMemNumaId < 0)
{
// We only use GDRCopy when there is no NUMA node for GPU memory.
bool gdrCopyInitedSuccess = true;
if (!tensorrt_llm::runtime::gdrcopy::isInitialized() && !tensorrt_llm::runtime::gdrcopy::initialize())
{
gdrCopyInitedSuccess = false;
}
if (gdrCopyInitedSuccess)
{
mGdrHandle = tensorrt_llm::runtime::gdrcopy::open();
}
}
mIsInited = true;
}
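
// Tears down the allocator state when the last user releases it.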
void HostAccessibleDeviceAllocator::shutdown()
{
if (mIsInited == false)
{
return;
}
// Ideally the last MoeLoadBalancer should close the GDRCopy handle here,
// but some allocations may still be outstanding, so closing the handle is unsafe.
// For now, we leave the GDRCopy handle open.
#if 0
if (mGdrHandle != nullptr) {
tensorrt_llm::runtime::gdrcopy::close(mGdrHandle);
mGdrHandle = nullptr;
}
#endif
mIsInited = false;
}
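
// Process-wide singleton accessor.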
HostAccessibleDeviceAllocator& HostAccessibleDeviceAllocator::getInstance()
{
static HostAccessibleDeviceAllocator instance;
return instance;
}
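
// Reference counting driven by MoeLoadBalancer instances: the first user
// triggers init(), the last one triggers shutdown().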
void HostAccessibleDeviceAllocator::IncRefCount()
{
std::lock_guard<std::mutex> lock(mRefMutex);
if (mLoadBalancerCount == 0)
{
init();
}
mLoadBalancerCount++;
}
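
// Decrements the reference count and shuts down when it reaches zero.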
void HostAccessibleDeviceAllocator::DecRefCount()
{
std::lock_guard<std::mutex> lock(mRefMutex);
mLoadBalancerCount--;
if (mLoadBalancerCount == 0)
{
shutdown();
}
}
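
// Records a new allocation in both lookup maps so it can later be found by
// either its device pointer or its host pointer.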
void HostAccessibleDeviceAllocator::recordAllocation(
void* devPtr, size_t memorySize, void* hostPtr, gdrcopy::GdrMemDesc* memDesc)
{
std::unique_lock<std::shared_mutex> lock(mAllocationsMutex);
mDeviceAllocations[devPtr] = {memorySize, hostPtr, devPtr, memDesc};
mHostAllocations[hostPtr] = {memorySize, hostPtr, devPtr, memDesc};
}
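
// Returns the recorded allocation whose host base address is the largest one not
// greater than hostPtr, or an empty AllocationInfo when no such allocation exists.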
HostAccessibleDeviceAllocator::AllocationInfo HostAccessibleDeviceAllocator::getAllocationInfoFromHostPtr(
void const* hostPtr)
{
std::shared_lock<std::shared_mutex> lock(mAllocationsMutex);
if (mHostAllocations.empty())
{
return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
}
auto it = mHostAllocations.upper_bound(hostPtr);
if (it == mHostAllocations.begin())
{
return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
}
--it;
return it->second;
}
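
// Same lookup as getAllocationInfoFromHostPtr, but keyed by device pointer.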
HostAccessibleDeviceAllocator::AllocationInfo HostAccessibleDeviceAllocator::getAllocationInfoFromDevPtr(
void const* devPtr)
{
std::shared_lock<std::shared_mutex> lock(mAllocationsMutex);
if (mDeviceAllocations.empty())
{
return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
}
auto it = mDeviceAllocations.upper_bound(devPtr);
if (it == mDeviceAllocations.begin())
{
return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
}
--it;
return it->second;
}
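
// Translates a device pointer (possibly offset into an allocation) to the
// corresponding host-accessible pointer, or nullptr if it was not allocated here.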
void* HostAccessibleDeviceAllocator::getHostPtr(void* devPtr)
{
auto allocationInfo = getAllocationInfoFromDevPtr(devPtr);
if (allocationInfo.devPtr == nullptr)
{
return nullptr;
}
void* recordedDevPtr = allocationInfo.devPtr;
size_t recordedSize = allocationInfo.size;
void* recordedHostPtr = allocationInfo.hostPtr;
auto pDev = static_cast<char*>(devPtr);
auto pRecordedDev = static_cast<char*>(recordedDevPtr);
if (pDev >= pRecordedDev && pDev < (pRecordedDev + recordedSize))
{
ptrdiff_t offset = pDev - pRecordedDev;
return static_cast<char*>(recordedHostPtr) + offset;
}
return nullptr;
}
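
// Copies size bytes from host memory src to the host-accessible destination dst.
// When GDRCopy is in use, the copy must go through the GDRCopy mapping; for NUMA
// or managed memory, a plain memcpy is sufficient.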
void HostAccessibleDeviceAllocator::memcpyToDevice(void* dst, void const* src, size_t size)
{
if (mGdrHandle != nullptr)
{
auto allocationInfo = getAllocationInfoFromHostPtr(dst);
TLLM_CHECK(allocationInfo.hostPtr != nullptr);
TLLM_CHECK(allocationInfo.memDesc != nullptr);
tensorrt_llm::runtime::gdrcopy::copy_to_mapping(allocationInfo.memDesc->gdrMh, dst, src, size);
}
else
{
memcpy(dst, src, size);
}
}
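
// Allocates host-accessible device memory, preferring (in order) GPU NUMA memory,
// GDRCopy-mapped device memory, and finally managed memory when the fallback is
// explicitly allowed. Must be called on the device the allocator was initialized for.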
void* HostAccessibleDeviceAllocator::allocate(size_t memorySize)
{
int currentDevId = -1;
TLLM_CUDA_CHECK(cudaGetDevice(&currentDevId));
TLLM_CHECK_WITH_INFO(currentDevId == mDevId,
"HostAccessibleDeviceAllocator is not initialized for the current device, currentDevId=%d, mDevId=%d",
currentDevId, mDevId);
TLLM_CHECK_WITH_INFO(isSupported(), "HostAccessibleDeviceAllocator is not supported on the current system.");
void* devPtr = nullptr;
void* hostPtr = nullptr;
gdrcopy::GdrMemDesc* memDesc = nullptr;
if (mGpuMemNumaId >= 0)
{
devPtr = TopologyDetector::getInstance().allocateCurrentGpuNumaMemory(memorySize);
hostPtr = devPtr;
}
else if (mGdrHandle)
{
gdrcopy::gdrCudaMalloc(&hostPtr, &devPtr, memorySize, &memDesc, mGdrHandle);
}
else
{
TLLM_CHECK_WITH_INFO(
mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize));
TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
hostPtr = devPtr;
}
recordAllocation(devPtr, memorySize, hostPtr, memDesc);
return devPtr;
}
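
// Frees an allocation previously returned by allocate() and removes it from the
// bookkeeping maps; warns if the pointer is unknown.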
void HostAccessibleDeviceAllocator::free(void* ptr)
{
std::unique_lock<std::shared_mutex> lock(mAllocationsMutex);
auto it = mDeviceAllocations.find(ptr);
if (it != mDeviceAllocations.end())
{
auto const& allocInfo = it->second;
if (allocInfo.memDesc)
{
gdrcopy::gdrCudaFree(allocInfo.memDesc, mGdrHandle);
}
else if (mGpuMemNumaId >= 0)
{
TopologyDetector::getInstance().freeCurrentGpuNumaMemory(const_cast<void*>(it->first), allocInfo.size);
}
else
{
TLLM_CHECK_WITH_INFO(
mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
TLLM_CUDA_CHECK(cudaFree(ptr));
}
void* hostPtr = it->second.hostPtr;
TLLM_CHECK_WITH_INFO(mHostAllocations.count(hostPtr) == 1, "host pointer not recorded.");
mDeviceAllocations.erase(it);
mHostAllocations.erase(hostPtr);
}
else
{
TLLM_LOG_WARNING("Attempted to free a pointer that was not allocated by HostAccessibleDeviceAllocator.");
}
}
} // namespace tensorrt_llm::runtime