/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <mutex>
#include <shared_mutex>
#include <string>

#include "gdrwrap.h"
#include "hostAccessibleDeviceAllocator.h"
#include "topologyDetector.h"

#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"

namespace tensorrt_llm::runtime
{

bool HostAccessibleDeviceAllocator::mAllowManagedFallback = false;

bool HostAccessibleDeviceAllocator::isSupported()
{
    if (TopologyDetector::getInstance().getCurrentGpuMemoryNumaId() >= 0)
    {
        // We are on a system where GPU memory is exposed as a NUMA node.
        return true;
    }
    if (!tensorrt_llm::runtime::gdrcopy::isInitialized() && !tensorrt_llm::runtime::gdrcopy::initialize())
    {
        // The system doesn't support GDRCopy; only the managed-memory fallback can work.
        return mAllowManagedFallback;
    }
    return true;
}

void HostAccessibleDeviceAllocator::init()
{
    TLLM_CHECK(mIsInited == false);
    if (getenv("TLLM_HOST_ACCESSIBLE_ALLOW_MANAGED_FALLBACK") != nullptr)
    {
        if (std::string(getenv("TLLM_HOST_ACCESSIBLE_ALLOW_MANAGED_FALLBACK")) == "1")
        {
            mAllowManagedFallback = true;
        }
    }
    TLLM_CUDA_CHECK(cudaGetDevice(&mDevId));
    mGpuMemNumaId = TopologyDetector::getInstance().getCurrentGpuMemoryNumaId();
    if (mGpuMemNumaId < 0)
    {
        // We only use GDRCopy when there is no NUMA node for GPU memory.
        bool gdrCopyInitedSuccess = true;
        if (!tensorrt_llm::runtime::gdrcopy::isInitialized() && !tensorrt_llm::runtime::gdrcopy::initialize())
        {
            gdrCopyInitedSuccess = false;
        }
        if (gdrCopyInitedSuccess)
        {
            mGdrHandle = tensorrt_llm::runtime::gdrcopy::open();
        }
    }
    mIsInited = true;
}

void HostAccessibleDeviceAllocator::shutdown()
{
    if (mIsInited == false)
    {
        return;
    }
    // Ideally the GDRCopy handle would be closed when the last MoeLoadBalancer releases the allocator,
    // but some allocations may still be outstanding at that point, so closing the handle is not safe.
    // For now, we leave the GDRCopy handle open.
#if 0
    if (mGdrHandle != nullptr)
    {
        tensorrt_llm::runtime::gdrcopy::close(mGdrHandle);
        mGdrHandle = nullptr;
    }
#endif
    mIsInited = false;
}

HostAccessibleDeviceAllocator& HostAccessibleDeviceAllocator::getInstance()
{
    static HostAccessibleDeviceAllocator instance;
    return instance;
}

// The allocator is shared; each MoeLoadBalancer holds a reference, and the first/last
// reference drives init/shutdown.
void HostAccessibleDeviceAllocator::IncRefCount()
{
    std::lock_guard lock(mRefMutex);
    if (mLoadBalancerCount == 0)
    {
        init();
    }
    mLoadBalancerCount++;
}

void HostAccessibleDeviceAllocator::DecRefCount()
{
    std::lock_guard lock(mRefMutex);
    mLoadBalancerCount--;
    if (mLoadBalancerCount == 0)
    {
        shutdown();
    }
}

// Record the allocation under both its device and host base pointers so lookups work from either side.
void HostAccessibleDeviceAllocator::recordAllocation(
    void* devPtr, size_t memorySize, void* hostPtr, gdrcopy::GdrMemDesc* memDesc)
{
    std::unique_lock lock(mAllocationsMutex);
    mDeviceAllocations[devPtr] = {memorySize, hostPtr, devPtr, memDesc};
    mHostAllocations[hostPtr] = {memorySize, hostPtr, devPtr, memDesc};
}

// Find the recorded allocation with the greatest host base pointer not greater than hostPtr.
HostAccessibleDeviceAllocator::AllocationInfo HostAccessibleDeviceAllocator::getAllocationInfoFromHostPtr(
    void const* hostPtr)
{
    std::shared_lock lock(mAllocationsMutex);
    if (mHostAllocations.empty())
    {
        return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
    }
    auto it = mHostAllocations.upper_bound(hostPtr);
    if (it == mHostAllocations.begin())
    {
        return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
    }
    --it;
    return it->second;
}

// Find the recorded allocation with the greatest device base pointer not greater than devPtr.
HostAccessibleDeviceAllocator::AllocationInfo HostAccessibleDeviceAllocator::getAllocationInfoFromDevPtr(
    void const* devPtr)
{
    std::shared_lock lock(mAllocationsMutex);
    if (mDeviceAllocations.empty())
    {
        return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
    }
    auto it = mDeviceAllocations.upper_bound(devPtr);
    if (it == mDeviceAllocations.begin())
    {
        return HostAccessibleDeviceAllocator::AllocationInfo{0, nullptr, nullptr, nullptr};
    }
    --it;
    return it->second;
}

// Translate a device pointer into its host-accessible counterpart, preserving the offset within the allocation.
void* HostAccessibleDeviceAllocator::getHostPtr(void* devPtr)
{
    auto allocationInfo = getAllocationInfoFromDevPtr(devPtr);
    if (allocationInfo.devPtr == nullptr)
    {
        return nullptr;
    }
    void* recordedDevPtr = allocationInfo.devPtr;
    size_t recordedSize = allocationInfo.size;
    void* recordedHostPtr = allocationInfo.hostPtr;
    auto pDev = static_cast<char*>(devPtr);
    auto pRecordedDev = static_cast<char*>(recordedDevPtr);
    if (pDev >= pRecordedDev && pDev < (pRecordedDev + recordedSize))
    {
        ptrdiff_t offset = pDev - pRecordedDev;
        return static_cast<char*>(recordedHostPtr) + offset;
    }
    return nullptr;
}

void HostAccessibleDeviceAllocator::memcpyToDevice(void* dst, void const* src, size_t size)
{
    if (mGdrHandle != nullptr)
    {
        auto allocationInfo = getAllocationInfoFromHostPtr(dst);
        TLLM_CHECK(allocationInfo.hostPtr != nullptr);
        TLLM_CHECK(allocationInfo.memDesc != nullptr);
        tensorrt_llm::runtime::gdrcopy::copy_to_mapping(allocationInfo.memDesc->gdrMh, dst, src, size);
    }
    else
    {
        memcpy(dst, src, size);
    }
}

void* HostAccessibleDeviceAllocator::allocate(size_t memorySize)
{
    int currentDevId = -1;
    TLLM_CUDA_CHECK(cudaGetDevice(&currentDevId));
    TLLM_CHECK_WITH_INFO(currentDevId == mDevId,
        "HostAccessibleDeviceAllocator is not initialized for the current device, currentDevId=%d, mDevId=%d",
        currentDevId, mDevId);
    TLLM_CHECK_WITH_INFO(isSupported(), "HostAccessibleDeviceAllocator is not supported on the current system.");
    void* devPtr = nullptr;
    void* hostPtr = nullptr;
    gdrcopy::GdrMemDesc* memDesc = nullptr;
    if (mGpuMemNumaId >= 0)
    {
        // GPU memory is a NUMA node: the host can address it directly.
        devPtr = TopologyDetector::getInstance().allocateCurrentGpuNumaMemory(memorySize);
        hostPtr = devPtr;
    }
    else if (mGdrHandle)
    {
        // Map the device allocation into the host address space via GDRCopy.
        gdrcopy::gdrCudaMalloc(&hostPtr, &devPtr, memorySize, &memDesc, mGdrHandle);
    }
    else
    {
        // Fallback: managed memory with the current GPU as the preferred location.
        TLLM_CHECK_WITH_INFO(
            mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
        TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize));
        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
        hostPtr = devPtr;
    }
    recordAllocation(devPtr, memorySize, hostPtr, memDesc);
    return devPtr;
}

void HostAccessibleDeviceAllocator::free(void* ptr)
{
    std::unique_lock lock(mAllocationsMutex);
    auto it = mDeviceAllocations.find(ptr);
    if (it != mDeviceAllocations.end())
    {
        auto const& allocInfo = it->second;
        if (allocInfo.memDesc)
        {
            gdrcopy::gdrCudaFree(allocInfo.memDesc, mGdrHandle);
        }
        else if (mGpuMemNumaId >= 0)
        {
            TopologyDetector::getInstance().freeCurrentGpuNumaMemory(const_cast<void*>(it->first), allocInfo.size);
        }
        else
        {
            TLLM_CHECK_WITH_INFO(
                mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
            TLLM_CUDA_CHECK(cudaFree(ptr));
        }
        void* hostPtr = it->second.hostPtr;
        TLLM_CHECK_WITH_INFO(mHostAllocations.count(hostPtr) == 1, "host pointer not recorded.");
        mDeviceAllocations.erase(it);
        mHostAllocations.erase(hostPtr);
    }
    else
    {
        TLLM_LOG_WARNING("Attempted to free a pointer that was not allocated by HostAccessibleDeviceAllocator.");
    }
}

} // namespace tensorrt_llm::runtime
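
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, excluded from the build like the disabled
// shutdown code above): it shows how a client such as an MoE load balancer is
// expected to drive the ref-counted lifetime and the allocate / getHostPtr /
// memcpyToDevice / free flow. The caller function and the staging buffer are
// hypothetical; only the public methods defined in this file are assumed.
#if 0
void exampleUsage()
{
    auto& allocator = tensorrt_llm::runtime::HostAccessibleDeviceAllocator::getInstance();
    allocator.IncRefCount(); // first holder initializes the allocator for the current device

    if (allocator.isSupported())
    {
        char stagingData[1024] = {};                          // hypothetical host-side source data
        void* devPtr = allocator.allocate(sizeof(stagingData)); // device memory the host can also reach
        void* hostPtr = allocator.getHostPtr(devPtr);            // host-visible alias of the device allocation
        allocator.memcpyToDevice(hostPtr, stagingData, sizeof(stagingData)); // uses GDRCopy when available
        allocator.free(devPtr);
    }

    allocator.DecRefCount(); // last holder shuts the allocator down
}
#endif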