/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <string.h>

#include <cstddef>
#include <cstdlib>
#include <cuda_runtime_api.h>

#include "gdrwrap.h"
#include "hostAccessibleDeviceAllocator.h"
#include "topologyDetector.h"

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"

namespace tensorrt_llm::runtime
{

bool HostAccessibleDeviceAllocator::mAllowManagedFallback = false;
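
// Host-accessible device memory is provided through one of three mechanisms,
// checked in order:
//   1. GPU memory exposed as a NUMA node (e.g. Grace Hopper class systems),
//      where device memory is directly addressable by the CPU;
//   2. GDRCopy, which maps device memory into the host address space;
//   3. cudaMallocManaged as an opt-in fallback (see init()).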
bool HostAccessibleDeviceAllocator::isSupported()
{
    if (TopologyDetector::getInstance().getCurrentGpuMemoryNumaId() >= 0)
    {
        // We are on a system where GPU memory is also a NUMA node.
        return true;
    }
    if (!tensorrt_llm::runtime::gdrcopy::isInitialized() && !tensorrt_llm::runtime::gdrcopy::initialize())
    {
        // The system doesn't support GDRCopy.
        return mAllowManagedFallback;
    }
    return true;
}
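
// Detects the allocation mechanism for the current device: prefer NUMA-exposed
// GPU memory, otherwise open a GDRCopy handle. Setting the environment
// variable TLLM_HOST_ACCESSIBLE_ALLOW_MANAGED_FALLBACK=1 additionally allows
// falling back to cudaMallocManaged when neither mechanism is available.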
void HostAccessibleDeviceAllocator::init()
{
    TLLM_CHECK(mIsInited == false);

    char const* envValue = getenv("TLLM_HOST_ACCESSIBLE_ALLOW_MANAGED_FALLBACK");
    if (envValue != nullptr && std::string(envValue) == "1")
    {
        mAllowManagedFallback = true;
    }

    TLLM_CUDA_CHECK(cudaGetDevice(&mDevId));
    mGpuMemNumaId = TopologyDetector::getInstance().getCurrentGpuMemoryNumaId();
    if (mGpuMemNumaId < 0)
    {
        // We only use GDRCopy when there is no NUMA node for GPU memory.
        bool gdrCopyInitedSuccess = true;
        if (!tensorrt_llm::runtime::gdrcopy::isInitialized() && !tensorrt_llm::runtime::gdrcopy::initialize())
        {
            gdrCopyInitedSuccess = false;
        }
        if (gdrCopyInitedSuccess)
        {
            mGdrHandle = tensorrt_llm::runtime::gdrcopy::open();
        }
    }
    mIsInited = true;
}
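
// Tears down per-process state once the last client releases its reference.
// Note that the GDRCopy handle is intentionally leaked; see the comment below.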
void HostAccessibleDeviceAllocator::shutdown()
{
    if (mIsInited == false)
    {
        return;
    }
    // Ideally, the GDRCopy handle would be closed when the last MoeLoadBalancer
    // shuts down, but some allocations may still be outstanding at that point,
    // so closing the handle is not safe. For now, we never close it.
#if 0
    if (mGdrHandle != nullptr)
    {
        tensorrt_llm::runtime::gdrcopy::close(mGdrHandle);
        mGdrHandle = nullptr;
    }
#endif
    mIsInited = false;
}
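
// Returns the process-wide singleton. A minimal usage sketch (hypothetical
// client code, not taken from this file):
//
//     auto& allocator = HostAccessibleDeviceAllocator::getInstance();
//     allocator.IncRefCount();                      // init on first reference
//     void* devPtr = allocator.allocate(bytes);
//     void* hostPtr = allocator.getHostPtr(devPtr); // CPU-visible alias
//     allocator.free(devPtr);
//     allocator.DecRefCount();                      // shutdown on last reference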
HostAccessibleDeviceAllocator& HostAccessibleDeviceAllocator::getInstance()
{
    static HostAccessibleDeviceAllocator instance;
    return instance;
}
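
// Reference counting ties the allocator's lifetime to that of its clients
// (the MoeLoadBalancer instances): init() runs when the count rises from
// zero, shutdown() when it drops back to zero. Both paths hold mRefMutex.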
void HostAccessibleDeviceAllocator::IncRefCount()
{
    std::lock_guard<std::mutex> lock(mRefMutex);
    if (mLoadBalancerCount == 0)
    {
        init();
    }
    mLoadBalancerCount++;
}

void HostAccessibleDeviceAllocator::DecRefCount()
{
    std::lock_guard<std::mutex> lock(mRefMutex);
    mLoadBalancerCount--;
    if (mLoadBalancerCount == 0)
    {
        shutdown();
    }
}
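
// Records a new allocation in two maps keyed by device pointer and by host
// pointer respectively, so that later lookups can start from either address.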
void HostAccessibleDeviceAllocator::recordAllocation(
    void* devPtr, size_t memorySize, void* hostPtr, gdrcopy::GdrMemDesc* memDesc)
{
    std::unique_lock<std::shared_mutex> lock(mAllocationsMutex);
    mDeviceAllocations[devPtr] = {memorySize, hostPtr, devPtr, memDesc};
    mHostAllocations[hostPtr] = {memorySize, hostPtr, devPtr, memDesc};
}
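
// Finds the allocation whose host range may contain hostPtr. The map is
// ordered by base address, so upper_bound() returns the first entry strictly
// greater than hostPtr; stepping back one entry yields the candidate whose
// base is <= hostPtr. The result is only a candidate: callers must still
// verify that the pointer falls within the returned allocation's size.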
HostAccessibleDeviceAllocator::AllocationInfo HostAccessibleDeviceAllocator::getAllocationInfoFromHostPtr(
    void const* hostPtr)
{
    std::shared_lock<std::shared_mutex> lock(mAllocationsMutex);
    if (mHostAllocations.empty())
    {
        return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
    }
    auto it = mHostAllocations.upper_bound(hostPtr);
    if (it == mHostAllocations.begin())
    {
        return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
    }
    --it;
    return it->second;
}
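
// Same candidate lookup as above, but keyed by device pointer.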
HostAccessibleDeviceAllocator::AllocationInfo HostAccessibleDeviceAllocator::getAllocationInfoFromDevPtr(
    void const* devPtr)
{
    std::shared_lock<std::shared_mutex> lock(mAllocationsMutex);
    if (mDeviceAllocations.empty())
    {
        return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
    }
    auto it = mDeviceAllocations.upper_bound(devPtr);
    if (it == mDeviceAllocations.begin())
    {
        return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
    }
    --it;
    return it->second;
}
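
// Translates a device pointer (possibly interior to an allocation) to its
// host-accessible counterpart by applying the same byte offset to the
// recorded host base address. Returns nullptr if devPtr was not allocated
// by this allocator.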
void* HostAccessibleDeviceAllocator::getHostPtr(void* devPtr)
{
    auto allocationInfo = getAllocationInfoFromDevPtr(devPtr);
    if (allocationInfo.devPtr == nullptr)
    {
        return nullptr;
    }
    void* recordedDevPtr = allocationInfo.devPtr;
    size_t recordedSize = allocationInfo.size;
    void* recordedHostPtr = allocationInfo.hostPtr;

    auto pDev = static_cast<char*>(devPtr);
    auto pRecordedDev = static_cast<char*>(recordedDevPtr);

    if (pDev >= pRecordedDev && pDev < (pRecordedDev + recordedSize))
    {
        ptrdiff_t offset = pDev - pRecordedDev;
        return static_cast<char*>(recordedHostPtr) + offset;
    }

    return nullptr;
}
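
// Copies host data to a host-accessible destination. When GDRCopy is active,
// the destination is typically a write-combined BAR mapping and should be
// written through gdrcopy's copy routine; otherwise (NUMA or managed memory)
// a plain memcpy suffices.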
void HostAccessibleDeviceAllocator::memcpyToDevice(void* dst, void const* src, size_t size)
{
    if (mGdrHandle != nullptr)
    {
        auto allocationInfo = getAllocationInfoFromHostPtr(dst);
        TLLM_CHECK(allocationInfo.hostPtr != nullptr);
        TLLM_CHECK(allocationInfo.memDesc != nullptr);
        tensorrt_llm::runtime::gdrcopy::copy_to_mapping(allocationInfo.memDesc->gdrMh, dst, src, size);
    }
    else
    {
        memcpy(dst, src, size);
    }
}
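
// Allocates host-accessible device memory via whichever mechanism init()
// selected: NUMA-node GPU memory (host and device pointers coincide),
// GDRCopy (a separate host mapping of the device buffer), or, as an opt-in
// fallback, managed memory with a preferred location on the current device.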
void* HostAccessibleDeviceAllocator::allocate(size_t memorySize)
{
    int currentDevId = -1;
    TLLM_CUDA_CHECK(cudaGetDevice(&currentDevId));
    TLLM_CHECK_WITH_INFO(currentDevId == mDevId,
        "HostAccessibleDeviceAllocator is not initialized for the current device, currentDevId=%d, mDevId=%d",
        currentDevId, mDevId);
    TLLM_CHECK_WITH_INFO(isSupported(), "HostAccessibleDeviceAllocator is not supported on the current system.");
    void* devPtr = nullptr;
    void* hostPtr = nullptr;
    gdrcopy::GdrMemDesc* memDesc = nullptr;
    if (mGpuMemNumaId >= 0)
    {
        devPtr = TopologyDetector::getInstance().allocateCurrentGpuNumaMemory(memorySize);
        hostPtr = devPtr;
    }
    else if (mGdrHandle)
    {
        gdrcopy::gdrCudaMalloc(&hostPtr, &devPtr, memorySize, &memDesc, mGdrHandle);
    }
    else
    {
        TLLM_CHECK_WITH_INFO(
            mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
        TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize));
        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
        hostPtr = devPtr;
    }
    recordAllocation(devPtr, memorySize, hostPtr, memDesc);
    return devPtr;
}
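
// Releases an allocation through the mechanism that created it, then removes
// it from both bookkeeping maps. Freeing a pointer this allocator did not
// hand out only logs a warning.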
void HostAccessibleDeviceAllocator::free(void* ptr)
{
    std::unique_lock<std::shared_mutex> lock(mAllocationsMutex);
    auto it = mDeviceAllocations.find(ptr);
    if (it != mDeviceAllocations.end())
    {
        auto const& allocInfo = it->second;
        if (allocInfo.memDesc)
        {
            gdrcopy::gdrCudaFree(allocInfo.memDesc, mGdrHandle);
        }
        else if (mGpuMemNumaId >= 0)
        {
            TopologyDetector::getInstance().freeCurrentGpuNumaMemory(const_cast<void*>(it->first), allocInfo.size);
        }
        else
        {
            TLLM_CHECK_WITH_INFO(
                mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
            TLLM_CUDA_CHECK(cudaFree(ptr));
        }
        void* hostPtr = it->second.hostPtr;
        TLLM_CHECK_WITH_INFO(mHostAllocations.count(hostPtr) == 1, "host pointer not recorded.");
        mDeviceAllocations.erase(it);
        mHostAllocations.erase(hostPtr);
    }
    else
    {
        TLLM_LOG_WARNING("Attempted to free a pointer that was not allocated by HostAccessibleDeviceAllocator.");
    }
}

} // namespace tensorrt_llm::runtime