TensorRT-LLM/cpp/tensorrt_llm/common/nvmlWrapper.cpp

/*
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/nvmlWrapper.h"

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"

#include <dlfcn.h>
#include <mutex>

TRTLLM_NAMESPACE_BEGIN
namespace common
{
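
// Returns the process-wide NVML wrapper. The instance is held through a weak_ptr,
// so libnvidia-ml is loaded on first use and released once the last caller drops
// its shared_ptr. A minimal usage sketch (assuming the default project namespace
// tensorrt_llm):
//
//   auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
//   if (nvml->nvmlInit() == NVML_SUCCESS)
//   {
//       nvmlDevice_t device{};
//       nvml->nvmlDeviceGetHandleByIndex(0, &device);
//       nvml->nvmlShutdown();
//   }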
std::shared_ptr<NVMLWrapper> NVMLWrapper::getInstance()
{
    static std::mutex mutex;
    static std::weak_ptr<NVMLWrapper> instance;

    // Concurrent lock() and assignment on the same std::weak_ptr object are not
    // thread-safe, so take the mutex before touching it.
    std::lock_guard<std::mutex> const lock(mutex);
    std::shared_ptr<NVMLWrapper> result = instance.lock();
    if (!result)
    {
        result = std::shared_ptr<NVMLWrapper>(new NVMLWrapper());
        instance = result;
    }
    return result;
}
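
// NVML is loaded with dlopen/dlsym at construction time instead of being linked
// in, so the same binary works across driver versions. Entry points that every
// supported driver exports are required and fail construction if missing; the
// GPU fabric-info entry points are optional because older drivers lack them.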
NVMLWrapper::NVMLWrapper()
    : mHandle(dlopen("libnvidia-ml.so.1", RTLD_LAZY))
{
    TLLM_CHECK_WITH_INFO(mHandle != nullptr, "NVML library (libnvidia-ml.so.1) could not be loaded.");

    auto loadSym = [](void* handle, char const* name) -> void* { return dlsym(handle, name); };
    auto loadRequired = [&](void* handle, char const* name) -> void*
    {
        void* sym = loadSym(handle, name);
        TLLM_CHECK_WITH_INFO(sym != nullptr, "Required NVML symbol not found: %s", name);
        return sym;
    };

    *reinterpret_cast<void**>(&_nvmlInit) = loadRequired(mHandle, "nvmlInit_v2");
    *reinterpret_cast<void**>(&_nvmlShutdown) = loadRequired(mHandle, "nvmlShutdown");
    *reinterpret_cast<void**>(&_nvmlDeviceGetHandleByIndex) = loadRequired(mHandle, "nvmlDeviceGetHandleByIndex_v2");
    *reinterpret_cast<void**>(&_nvmlDeviceGetHandleByPciBusId)
        = loadRequired(mHandle, "nvmlDeviceGetHandleByPciBusId_v2");
    *reinterpret_cast<void**>(&_nvmlDeviceGetIndex) = loadRequired(mHandle, "nvmlDeviceGetIndex");
    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkRemotePciInfo)
        = loadRequired(mHandle, "nvmlDeviceGetNvLinkRemotePciInfo_v2");
    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkCapability) = loadRequired(mHandle, "nvmlDeviceGetNvLinkCapability");
    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkState) = loadRequired(mHandle, "nvmlDeviceGetNvLinkState");
    *reinterpret_cast<void**>(&_nvmlErrorString) = loadRequired(mHandle, "nvmlErrorString");
    *reinterpret_cast<void**>(&_nvmlDeviceGetComputeRunningProcesses)
        = loadRequired(mHandle, "nvmlDeviceGetComputeRunningProcesses_v3");

    // Optional symbols - nullptr is OK (older drivers may not have these)
    *reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfoV) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfoV");
    *reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfo) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfo");
    if (!_nvmlDeviceGetGpuFabricInfoV)
    {
        TLLM_LOG_INFO(
            "NVML symbol nvmlDeviceGetGpuFabricInfoV not available (older driver). MNNVL fabric detection will use "
            "legacy API or be disabled.");
    }
    if (!_nvmlDeviceGetGpuFabricInfo)
    {
        TLLM_LOG_INFO("NVML symbol nvmlDeviceGetGpuFabricInfo not available.");
    }
}
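
// Unload the library when the last user of the wrapper goes away; calling
// nvmlInit()/nvmlShutdown() remains the caller's responsibility.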
NVMLWrapper::~NVMLWrapper()
{
    dlclose(mHandle);
}
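
// Thin pass-throughs to the dynamically resolved NVML entry points.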
nvmlReturn_t NVMLWrapper::nvmlInit() const
{
    return (*_nvmlInit)();
}

nvmlReturn_t NVMLWrapper::nvmlShutdown() const
{
    return (*_nvmlShutdown)();
}

nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const
{
    return (*_nvmlDeviceGetHandleByIndex)(index, device);
}

nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const
{
    return (*_nvmlDeviceGetHandleByPciBusId)(pciBusId, device);
}

nvmlReturn_t NVMLWrapper::nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const
{
    return (*_nvmlDeviceGetIndex)(device, index);
}

nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkRemotePciInfo(
    nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const
{
    return (*_nvmlDeviceGetNvLinkRemotePciInfo)(device, link, pci);
}

nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkCapability(
    nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const
{
    return (*_nvmlDeviceGetNvLinkCapability)(device, link, capability, capResult);
}

nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkState(
    nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const
{
    return (*_nvmlDeviceGetNvLinkState)(device, link, isActive);
}

char const* NVMLWrapper::nvmlErrorString(nvmlReturn_t result) const
{
    return (*_nvmlErrorString)(result);
}
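
// The fabric-info entry points may be absent on older drivers. Instead of
// crashing through a null function pointer, report NVML_ERROR_FUNCTION_NOT_FOUND
// so callers can fall back to the legacy API or disable MNNVL fabric detection.
// A caller-side sketch (hypothetical caller code, not part of this file):
//
//   nvmlGpuFabricInfoV_t info{}; // caller sets info.version as required by nvml.h
//   if (wrapper->hasGpuFabricInfoV()
//       && wrapper->nvmlDeviceGetGpuFabricInfoV(device, &info) == NVML_SUCCESS)
//   {
//       // use the versioned fabric info
//   }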
nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const
{
    if (!_nvmlDeviceGetGpuFabricInfoV)
    {
        return NVML_ERROR_FUNCTION_NOT_FOUND;
    }
    return (*_nvmlDeviceGetGpuFabricInfoV)(device, gpuFabricInfo);
}

nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const
{
    if (!_nvmlDeviceGetGpuFabricInfo)
    {
        return NVML_ERROR_FUNCTION_NOT_FOUND;
    }
    return (*_nvmlDeviceGetGpuFabricInfo)(device, gpuFabricInfo);
}

nvmlReturn_t NVMLWrapper::nvmlDeviceGetComputeRunningProcesses(
    nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const
{
    return (*_nvmlDeviceGetComputeRunningProcesses)(device, infoCount, infos);
}
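
// Availability queries so callers can pick the versioned fabric-info API when
// the driver provides it and otherwise fall back.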
bool NVMLWrapper::hasGpuFabricInfoV() const
{
    return _nvmlDeviceGetGpuFabricInfoV != nullptr;
}

bool NVMLWrapper::hasGpuFabricInfo() const
{
    return _nvmlDeviceGetGpuFabricInfo != nullptr;
}
} // namespace common
TRTLLM_NAMESPACE_END