[https://nvbugs/5887893][fix] Make NVML work with older CUDA driver versions (#11465)

Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
Iman Tabrizian 2026-02-12 15:06:47 -08:00 committed by GitHub
parent 5130cbd73e
commit dd74f90914
11 changed files with 429 additions and 84 deletions
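
The gist of the change: rather than linking against libnvidia-ml at build time and calling NVML symbols directly, which fails when an older driver ships a library missing the newer symbols, NVML is now dlopen'ed and each symbol is resolved individually at runtime. A minimal standalone sketch of that pattern (illustrative only, not the committed code; build with -ldl on Linux):

#include <cstdio>
#include <dlfcn.h>

int main()
{
    // Load the NVML shared library at runtime instead of at link time.
    void* handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
    if (handle == nullptr)
    {
        std::fprintf(stderr, "NVML unavailable: %s\n", dlerror());
        return 1;
    }
    // Required symbol: present on every supported driver, so absence is fatal.
    auto* init = reinterpret_cast<int (*)()>(dlsym(handle, "nvmlInit_v2"));
    if (init == nullptr)
    {
        std::fprintf(stderr, "missing required symbol nvmlInit_v2\n");
        return 1;
    }
    // Optional symbol: nullptr on an older driver just disables the feature
    // instead of breaking the whole library load.
    void* fabricInfoV = dlsym(handle, "nvmlDeviceGetGpuFabricInfoV");
    std::printf("nvmlDeviceGetGpuFabricInfoV is %savailable\n", fabricInfoV ? "" : "not ");
    dlclose(handle);
    return 0;
}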


@@ -0,0 +1,184 @@
/*
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <dlfcn.h>
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/nvmlWrapper.h"
#include <mutex>
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::shared_ptr<NVMLWrapper> NVMLWrapper::getInstance()
{
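// Double-checked locking over a weak_ptr: every caller shares one wrapper,
// but it is destroyed (and NVML unloaded) once the last shared_ptr is released.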
static std::mutex mutex;
static std::weak_ptr<NVMLWrapper> instance;
std::shared_ptr<NVMLWrapper> result = instance.lock();
if (result)
{
return result;
}
std::lock_guard<std::mutex> const lock(mutex);
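// Re-check under the lock: another thread may have created the instance
// between the fast-path lock() above and acquiring the mutex.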
result = instance.lock();
if (!result)
{
result = std::shared_ptr<NVMLWrapper>(new NVMLWrapper());
instance = result;
}
return result;
}
NVMLWrapper::NVMLWrapper()
: mHandle(dlopen("libnvidia-ml.so.1", RTLD_LAZY))
{
TLLM_CHECK_WITH_INFO(mHandle != nullptr, "NVML library (libnvidia-ml.so.1) could not be loaded.");
auto loadSym = [](void* handle, char const* name) -> void* { return dlsym(handle, name); };
auto loadRequired = [&](void* handle, char const* name) -> void*
{
void* sym = loadSym(handle, name);
TLLM_CHECK_WITH_INFO(sym != nullptr, "Required NVML symbol not found: %s", name);
return sym;
};
*reinterpret_cast<void**>(&_nvmlInit) = loadRequired(mHandle, "nvmlInit_v2");
*reinterpret_cast<void**>(&_nvmlShutdown) = loadRequired(mHandle, "nvmlShutdown");
*reinterpret_cast<void**>(&_nvmlDeviceGetHandleByIndex) = loadRequired(mHandle, "nvmlDeviceGetHandleByIndex_v2");
*reinterpret_cast<void**>(&_nvmlDeviceGetHandleByPciBusId)
= loadRequired(mHandle, "nvmlDeviceGetHandleByPciBusId_v2");
*reinterpret_cast<void**>(&_nvmlDeviceGetIndex) = loadRequired(mHandle, "nvmlDeviceGetIndex");
*reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkRemotePciInfo)
= loadRequired(mHandle, "nvmlDeviceGetNvLinkRemotePciInfo_v2");
*reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkCapability) = loadRequired(mHandle, "nvmlDeviceGetNvLinkCapability");
*reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkState) = loadRequired(mHandle, "nvmlDeviceGetNvLinkState");
*reinterpret_cast<void**>(&_nvmlErrorString) = loadRequired(mHandle, "nvmlErrorString");
*reinterpret_cast<void**>(&_nvmlDeviceGetComputeRunningProcesses)
= loadRequired(mHandle, "nvmlDeviceGetComputeRunningProcesses_v3");
// Optional symbols - nullptr is OK (older drivers may not have these)
*reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfoV) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfoV");
*reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfo) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfo");
if (!_nvmlDeviceGetGpuFabricInfoV)
{
TLLM_LOG_INFO(
"NVML symbol nvmlDeviceGetGpuFabricInfoV not available (older driver). MNNVL fabric detection will use "
"legacy API or be disabled.");
}
if (!_nvmlDeviceGetGpuFabricInfo)
{
TLLM_LOG_INFO("NVML symbol nvmlDeviceGetGpuFabricInfo not available.");
}
}
NVMLWrapper::~NVMLWrapper()
{
dlclose(mHandle);
}
nvmlReturn_t NVMLWrapper::nvmlInit() const
{
return (*_nvmlInit)();
}
nvmlReturn_t NVMLWrapper::nvmlShutdown() const
{
return (*_nvmlShutdown)();
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const
{
return (*_nvmlDeviceGetHandleByIndex)(index, device);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const
{
return (*_nvmlDeviceGetHandleByPciBusId)(pciBusId, device);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const
{
return (*_nvmlDeviceGetIndex)(device, index);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkRemotePciInfo(
nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const
{
return (*_nvmlDeviceGetNvLinkRemotePciInfo)(device, link, pci);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkCapability(
nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const
{
return (*_nvmlDeviceGetNvLinkCapability)(device, link, capability, capResult);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkState(
nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const
{
return (*_nvmlDeviceGetNvLinkState)(device, link, isActive);
}
char const* NVMLWrapper::nvmlErrorString(nvmlReturn_t result) const
{
return (*_nvmlErrorString)(result);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const
{
if (!_nvmlDeviceGetGpuFabricInfoV)
{
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
return (*_nvmlDeviceGetGpuFabricInfoV)(device, gpuFabricInfo);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const
{
if (!_nvmlDeviceGetGpuFabricInfo)
{
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
return (*_nvmlDeviceGetGpuFabricInfo)(device, gpuFabricInfo);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetComputeRunningProcesses(
nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const
{
return (*_nvmlDeviceGetComputeRunningProcesses)(device, infoCount, infos);
}
bool NVMLWrapper::hasGpuFabricInfoV() const
{
return _nvmlDeviceGetGpuFabricInfoV != nullptr;
}
bool NVMLWrapper::hasGpuFabricInfo() const
{
return _nvmlDeviceGetGpuFabricInfo != nullptr;
}
} // namespace common
TRTLLM_NAMESPACE_END
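
Callers can either probe hasGpuFabricInfoV()/hasGpuFabricInfo() up front or call through the wrapper and branch on NVML_ERROR_FUNCTION_NOT_FOUND. A hedged usage sketch of the dispatch the diffs below perform (queryFabricState is a hypothetical helper, not part of this commit):

#include "tensorrt_llm/common/nvmlWrapper.h"
#include <cstring>

// Read the fabric state via the versioned API when the driver exposes it,
// otherwise fall back to the legacy API.
nvmlReturn_t queryFabricState(nvmlDevice_t device, nvmlGpuFabricState_t* state)
{
    auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
    if (nvml->hasGpuFabricInfoV())
    {
        nvmlGpuFabricInfoV_t info;
        std::memset(&info, 0, sizeof(info));
        info.version = nvmlGpuFabricInfo_v2; // the v2 layout is understood by older drivers than v3
        nvmlReturn_t result = nvml->nvmlDeviceGetGpuFabricInfoV(device, &info);
        if (result == NVML_SUCCESS)
        {
            *state = info.state;
        }
        return result;
    }
    if (nvml->hasGpuFabricInfo())
    {
        nvmlGpuFabricInfo_t info;
        nvmlReturn_t result = nvml->nvmlDeviceGetGpuFabricInfo(device, &info);
        if (result == NVML_SUCCESS)
        {
            *state = info.state;
        }
        return result;
    }
    return NVML_ERROR_FUNCTION_NOT_FOUND; // neither API exists on this driver
}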


@@ -0,0 +1,123 @@
/*
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef NVML_WRAPPER_H
#define NVML_WRAPPER_H
#include "tensorrt_llm/common/config.h"
#include <nvml.h>
#include <memory>
TRTLLM_NAMESPACE_BEGIN
namespace common
{
class NVMLWrapper
{
public:
static std::shared_ptr<NVMLWrapper> getInstance();
~NVMLWrapper();
NVMLWrapper(NVMLWrapper const&) = delete;
NVMLWrapper& operator=(NVMLWrapper const&) = delete;
NVMLWrapper(NVMLWrapper&&) = delete;
NVMLWrapper& operator=(NVMLWrapper&&) = delete;
// Required NVML functions
nvmlReturn_t nvmlInit() const;
nvmlReturn_t nvmlShutdown() const;
nvmlReturn_t nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const;
nvmlReturn_t nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const;
nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const;
nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const;
nvmlReturn_t nvmlDeviceGetNvLinkCapability(
nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const;
nvmlReturn_t nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const;
char const* nvmlErrorString(nvmlReturn_t result) const;
nvmlReturn_t nvmlDeviceGetComputeRunningProcesses(
nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const;
// Optional NVML functions (may be nullptr on older drivers)
nvmlReturn_t nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const;
nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const;
// Runtime availability checks
bool hasGpuFabricInfoV() const;
bool hasGpuFabricInfo() const;
private:
void* mHandle;
NVMLWrapper();
// Required function pointers
nvmlReturn_t (*_nvmlInit)();
nvmlReturn_t (*_nvmlShutdown)();
nvmlReturn_t (*_nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t*);
nvmlReturn_t (*_nvmlDeviceGetHandleByPciBusId)(char const*, nvmlDevice_t*);
nvmlReturn_t (*_nvmlDeviceGetIndex)(nvmlDevice_t, unsigned int*);
nvmlReturn_t (*_nvmlDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t, unsigned int, nvmlPciInfo_t*);
nvmlReturn_t (*_nvmlDeviceGetNvLinkCapability)(nvmlDevice_t, unsigned int, nvmlNvLinkCapability_t, unsigned int*);
nvmlReturn_t (*_nvmlDeviceGetNvLinkState)(nvmlDevice_t, unsigned int, nvmlEnableState_t*);
char const* (*_nvmlErrorString)(nvmlReturn_t);
nvmlReturn_t (*_nvmlDeviceGetComputeRunningProcesses)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*);
// Optional function pointers (may be nullptr)
nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfoV)(nvmlDevice_t, nvmlGpuFabricInfoV_t*);
nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfo)(nvmlDevice_t, nvmlGpuFabricInfo_t*);
};
// RAII class that initializes NVML on construction and shuts down on destruction.
// Replaces duplicated NvmlManager classes in allreduceOp.cpp and allreducePlugin.cpp.
class NvmlManager
{
public:
NvmlManager()
: mNvml(NVMLWrapper::getInstance())
{
auto result = mNvml->nvmlInit();
if (result != NVML_SUCCESS)
{
TLLM_THROW("Failed to initialize NVML: %s", mNvml->nvmlErrorString(result));
}
}
~NvmlManager()
{
mNvml->nvmlShutdown();
}
NVMLWrapper const& wrapper() const
{
return *mNvml;
}
std::shared_ptr<NVMLWrapper> const& sharedWrapper() const
{
return mNvml;
}
private:
std::shared_ptr<NVMLWrapper> mNvml;
};
} // namespace common
TRTLLM_NAMESPACE_END
#endif // NVML_WRAPPER_H
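
A short usage sketch of the RAII manager (the caller is hypothetical, for illustration): NVML initialization and shutdown are tied to the object's scope, which is exactly how the plugin and op diffs below use it.

#include "tensorrt_llm/common/nvmlWrapper.h"

void probeFirstDevice() // hypothetical caller
{
    tensorrt_llm::common::NvmlManager manager; // nvmlInit() runs here; throws on failure
    auto const& nvml = manager.sharedWrapper();
    nvmlDevice_t device;
    if (nvml->nvmlDeviceGetHandleByIndex(0, &device) != NVML_SUCCESS)
    {
        return; // no usable device; shutdown still runs below
    }
    // ... query NVLink or fabric state through nvml ...
} // nvmlShutdown() runs on scope exit, even during exception unwinding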


@@ -38,6 +38,8 @@
#include <string>
#include <unordered_map>
#include "tensorrt_llm/common/nvmlWrapper.h"
TRTLLM_NAMESPACE_BEGIN
namespace common::op
@@ -319,7 +321,8 @@ TRTLLM_NAMESPACE_END
nvmlReturn_t r = cmd; \
if (r != NVML_SUCCESS) \
{ \
printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)
@@ -330,6 +333,7 @@ TRTLLM_NAMESPACE_END
nvmlReturn_t r = cmd; \
if (TLLM_UNLIKELY(r != NVML_SUCCESS)) \
{ \
TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r)); \
} \
} while (0)


@@ -53,7 +53,6 @@ target_link_libraries(
${TORCH_LIBRARIES}
torch_python
${CUDA_DRV_LIB}
${CUDA_NVML_LIB}
th_common
pg_utils)
target_compile_definitions(


@@ -170,7 +170,6 @@ target_link_libraries(
${CUBLASLT_LIB}
${TRT_LIB}
${CUDA_DRV_LIB}
${CUDA_NVML_LIB}
${CUDA_RT_LIB}
${CMAKE_DL_LIBS}
${SHARED_TARGET})


@@ -19,6 +19,7 @@
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/customAllReduceUtils.h"
#include "tensorrt_llm/common/dataType.h"
#include "tensorrt_llm/common/nvmlWrapper.h"
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
#include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
@@ -601,19 +602,8 @@ bool AllreducePlugin::isCustomAllReduceSupported(int ranks_per_node) const noexcept
&& (static_cast<size_t>(ranks_per_node) <= kernels::MAX_RANKS_PER_NODE) && (ranks_per_node > 0);
}
class NvmlManager
{
public:
NvmlManager()
{
NVML_CHECK(nvmlInit());
}
~NvmlManager()
{
NVML_CHECK(nvmlShutdown());
}
};
using tensorrt_llm::common::NvmlManager;
using tensorrt_llm::common::NVMLWrapper;
std::set<int> getLocalGroup(std::set<int> const& group)
{
@@ -711,6 +701,7 @@ void AllreducePlugin::setGroupTopology() noexcept
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
NvmlManager nvmlManager;
auto const& nvml = nvmlManager.sharedWrapper();
std::unordered_set<int> visitedDevice;
mIsP2PSupported = true;
mIsNVLINKSupported = true;
@@ -738,26 +729,26 @@
}
nvmlDevice_t firstDevice;
NVML_CHECK(nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
bool isNVLINK = false;
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
{
nvmlPciInfo_t remotePciInfo;
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
{
continue;
}
nvmlDevice_t remoteDevice;
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remotePciInfo.busId, &remoteDevice);
auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remotePciInfo.busId, &remoteDevice);
if (result == NVML_SUCCESS)
{
// Two GPUs are connected directly through nvlink
unsigned int remoteDeviceId;
NVML_CHECK(nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
NVML_CHECK(nvml->nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
if (remoteDeviceId == static_cast<unsigned int>(secondDeviceId))
{
@@ -770,12 +761,12 @@
// remotePciInfo now holds the PCI information of an NVSwitch; determine
// NVLink support by checking whether both GPUs are connected to the same NVSwitch.
nvmlDevice_t secondDevice;
NVML_CHECK(nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
for (unsigned int secondLink = 0; secondLink < NVML_NVLINK_MAX_LINKS; secondLink++)
{
nvmlPciInfo_t secondRemotePciInfo;
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(secondDevice, secondLink, &secondRemotePciInfo)
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(secondDevice, secondLink, &secondRemotePciInfo)
!= NVML_SUCCESS)
{
continue;


@@ -81,7 +81,6 @@ set_property(TARGET runtime_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_cuda_architectures(runtime_src 89)
target_include_directories(runtime_src PRIVATE ${MPI_C_INCLUDE_DIRS})
target_link_libraries(runtime_src PUBLIC ${CUDA_NVML_LIB})
if(ENABLE_MULTI_DEVICE)
target_link_libraries(runtime_src PUBLIC ${NCCL_LIB})


@@ -23,7 +23,7 @@
#include <nvshmem/nvshmemx.h>
#endif
#if ENABLE_MULTI_DEVICE
#include <nvml.h>
#include "tensorrt_llm/common/nvmlWrapper.h"
#endif
#include <unistd.h>
@@ -46,7 +46,8 @@
nvmlReturn_t retval = cmd; \
if (retval != NVML_SUCCESS) \
{ \
printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(retval)); \
printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(retval)); \
exit(EXIT_FAILURE); \
} \
} while (0)
@@ -329,18 +330,41 @@ private:
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
tensorrt_llm::common::NvmlManager nvmlManager;
nvmlDevice_t nvml_device;
nvmlGpuFabricInfo_t fabric_info;
NVMLCHECK(nvmlInit_v2());
NVMLCHECK(nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
NVMLCHECK(nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
NVMLCHECK(nvmlShutdown());
NVMLCHECK(nvml->nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
nvmlGpuFabricState_t fabric_state;
nvmlReturn_t fabric_status;
if (nvml->hasGpuFabricInfoV())
{
nvmlGpuFabricInfoV_t fabric_info_v;
memset(&fabric_info_v, 0, sizeof(fabric_info_v));
fabric_info_v.version = nvmlGpuFabricInfo_v2;
NVMLCHECK(nvml->nvmlDeviceGetGpuFabricInfoV(nvml_device, &fabric_info_v));
fabric_state = fabric_info_v.state;
fabric_status = fabric_info_v.status;
}
else if (nvml->hasGpuFabricInfo())
{
nvmlGpuFabricInfo_t fabric_info;
NVMLCHECK(nvml->nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
fabric_state = fabric_info.state;
fabric_status = fabric_info.status;
}
else
{
TLLM_LOG_TRACE("checking fabric support... NVML fabric info APIs not available.");
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
// Check if the fabric is fully initialized.
if (fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_info.status != NVML_SUCCESS)
if (fabric_state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_status != NVML_SUCCESS)
{
TLLM_LOG_TRACE("checking fabric support... fabric state is NOT COMPLETE: state=%u status=%u.",
fabric_info.state, fabric_info.status);
TLLM_LOG_TRACE("checking fabric support... fabric state is NOT COMPLETE: state=%u status=%u.", fabric_state,
fabric_status);
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
@@ -381,8 +405,7 @@ private:
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
TLLM_LOG_TRACE("fabric status: state=%u status=%u clique=%u", device_id, fabric_info.state, fabric_info.status,
fabric_info.cliqueId);
TLLM_LOG_TRACE("fabric status: state=%u status=%u", device_id, fabric_state, fabric_status);
CUCHECK(cuMemRelease(handle));
// If we get here, fabric handles are supported.
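
Condensed, the fallback policy this function now implements (a hedged paraphrase of the control flow above, not the committed code; chooseHandleType is illustrative):

#include <cuda.h>
#include <nvml.h>

// Fabric handles only when the fabric probe succeeded and the fabric is fully
// initialized; anything else (including missing NVML APIs) falls back to POSIX FDs.
CUmemAllocationHandleType chooseHandleType(nvmlGpuFabricState_t state, nvmlReturn_t status)
{
    if (state == NVML_GPU_FABRIC_STATE_COMPLETED && status == NVML_SUCCESS)
    {
        return CU_MEM_HANDLE_TYPE_FABRIC;
    }
    return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}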


@@ -134,8 +134,7 @@ endif()
if(ENABLE_MULTI_DEVICE)
target_include_directories(th_common PUBLIC ${MPI_C_INCLUDE_DIRS})
target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB}
CUDA::nvml)
target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB})
endif()
if(NOT WIN32)


@@ -21,6 +21,7 @@
#include "tensorrt_llm/common/dataType.h"
#include "tensorrt_llm/common/mcastDevMemUtils.h"
#include "tensorrt_llm/common/ncclUtils.h"
#include "tensorrt_llm/common/nvmlWrapper.h"
#include "tensorrt_llm/common/opUtils.h"
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
#include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
@@ -85,19 +86,8 @@ struct overloaded : Ts...
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;
class NvmlManager
{
public:
NvmlManager()
{
NVML_CHECK_THROW(nvmlInit());
}
~NvmlManager()
{
NVML_CHECK(nvmlShutdown());
}
};
using tensorrt_llm::common::NvmlManager;
using tensorrt_llm::common::NVMLWrapper;
std::set<int> getLocalGroup(std::set<int> const& group)
{
@@ -965,7 +955,7 @@ private:
MNNVLFabricInfo info;
#if ENABLE_MULTI_DEVICE
// 1. Check CUDA driver version (needs >= 12.0.10)
// Check CUDA driver version (needs >= 12.0.10)
int cudaDriverVersion = -1;
TLLM_CUDA_CHECK(cudaDriverGetVersion(&cudaDriverVersion));
if (cudaDriverVersion < 12010)
@@ -974,7 +964,7 @@ private:
return info;
}
// 2. Check multicast support
// Check multicast support
CUdevice cuDevice;
TLLM_CU_CHECK(cuDeviceGet(&cuDevice, deviceId));
auto cudaDriver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
@@ -988,7 +978,7 @@ private:
return info;
}
// 3. Check fabric handle support
// Check fabric handle support
int fabricHandleSupported = 0;
TLLM_CU_CHECK(cudaDriver->cuDeviceGetAttribute(
&fabricHandleSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cuDevice));
@@ -998,9 +988,10 @@ private:
return info;
}
// 4. Check NVML GPU Fabric Info using versioned API
// Check NVML GPU Fabric Info using versioned API (runtime dispatch)
auto nvml = NVMLWrapper::getInstance();
nvmlDevice_t nvmlDevice;
nvmlReturn_t nvmlResult = nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
nvmlReturn_t nvmlResult = nvml->nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
if (nvmlResult != NVML_SUCCESS)
{
TLLM_LOG_DEBUG("MNNVL check: Failed to get NVML device handle for device %d - error=%d", deviceId,
@@ -1008,24 +999,48 @@ private:
return info;
}
nvmlGpuFabricInfoV_t fabricInfoV;
std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
fabricInfoV.version = NVML_STRUCT_VERSION(GpuFabricInfo, 3);
nvmlResult = nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
nvmlGpuFabricState_t fabricState;
nvmlReturn_t fabricStatus;
unsigned char fabricClusterUuid[NVML_GPU_FABRIC_UUID_LEN];
unsigned int fabricCliqueId;
if (nvml->hasGpuFabricInfoV())
{
nvmlGpuFabricInfoV_t fabricInfoV;
std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
fabricInfoV.version = nvmlGpuFabricInfo_v2;
nvmlResult = nvml->nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
fabricState = fabricInfoV.state;
fabricStatus = fabricInfoV.status;
std::memcpy(fabricClusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
fabricCliqueId = fabricInfoV.cliqueId;
}
else if (nvml->hasGpuFabricInfo())
{
nvmlGpuFabricInfo_t fabricInfoLegacy;
nvmlResult = nvml->nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfoLegacy);
fabricState = fabricInfoLegacy.state;
fabricStatus = fabricInfoLegacy.status;
std::memcpy(fabricClusterUuid, fabricInfoLegacy.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
fabricCliqueId = fabricInfoLegacy.cliqueId;
}
else
{
TLLM_LOG_DEBUG("MNNVL check: Neither nvmlDeviceGetGpuFabricInfoV nor nvmlDeviceGetGpuFabricInfo available");
return info;
}
if (nvmlResult != NVML_SUCCESS)
{
TLLM_LOG_DEBUG(
"MNNVL check: nvmlDeviceGetGpuFabricInfoV failed for device %d - error=%d (not supported or "
"MNNVL check: nvmlDeviceGetGpuFabricInfo failed for device %d - error=%d (not supported or "
"no fabric manager)",
deviceId, static_cast<int>(nvmlResult));
return info;
}
// Check if fabric is fully initialized
if (fabricInfoV.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfoV.status != NVML_SUCCESS)
if (fabricState != NVML_GPU_FABRIC_STATE_COMPLETED || fabricStatus != NVML_SUCCESS)
{
TLLM_LOG_DEBUG(
"MNNVL check: Fabric state not complete - state=%u status=%u", fabricInfoV.state, fabricInfoV.status);
TLLM_LOG_DEBUG("MNNVL check: Fabric state not complete - state=%u status=%u", fabricState, fabricStatus);
return info;
}
@@ -1034,7 +1049,7 @@ private:
bool clusterUuidValid = false;
for (int i = 0; i < NVML_GPU_FABRIC_UUID_LEN; ++i)
{
if (fabricInfoV.clusterUuid[i] != 0)
if (fabricClusterUuid[i] != 0)
{
clusterUuidValid = true;
break;
@@ -1047,7 +1062,7 @@ private:
return info;
}
// 5. Check NVLink links are active (similar to Python support_nvlink(True))
// Check NVLink links are active (similar to Python support_nvlink(True))
unsigned int activeLinks = 0;
unsigned int availableLinks = 0;
@@ -1055,12 +1070,12 @@
{
unsigned int capP2p = 0;
nvmlReturn_t capResult
= nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
= nvml->nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
if (capResult == NVML_SUCCESS && capP2p)
{
availableLinks++;
nvmlEnableState_t linkState;
if (nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
if (nvml->nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
&& linkState == NVML_FEATURE_ENABLED)
{
activeLinks++;
@@ -1077,12 +1092,12 @@
}
// Device supports MNNVL - copy fabric info
std::memcpy(info.clusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
info.cliqueId = fabricInfoV.cliqueId;
std::memcpy(info.clusterUuid, fabricClusterUuid, NVML_GPU_FABRIC_UUID_LEN);
info.cliqueId = fabricCliqueId;
info.isValid = true;
TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (clusterUuid=%s, cliqueId=%u)", deviceId,
info.getClusterUuidString().c_str(), fabricInfoV.cliqueId);
info.getClusterUuidString().c_str(), fabricCliqueId);
#endif
return info;
}
@@ -1104,6 +1119,7 @@ private:
bool is_inter_node = (mGroup.size() != local_group.size());
NvmlManager nvml_manager;
auto const& nvml = nvml_manager.sharedWrapper();
mIsP2PSupported = true;
mIsNVLINKSupported = true;
mIsMNNVLSupported = false;
@@ -1134,26 +1150,27 @@
}
nvmlDevice_t first_device;
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
bool is_NVLINK = false;
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
{
nvmlPciInfo_t remote_pci_info;
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(first_device, link, &remote_pci_info)
!= NVML_SUCCESS)
{
continue;
}
nvmlDevice_t remote_device;
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remote_pci_info.busId, &remote_device);
if (result == NVML_SUCCESS)
{
// Two GPUs are connected directly through nvlink
unsigned int remote_device_id;
NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
NVML_CHECK_THROW(nvml->nvmlDeviceGetIndex(remote_device, &remote_device_id));
if (remote_device_id == static_cast<unsigned int>(second_device_id))
{
@@ -1167,12 +1184,12 @@
// determine NVLink support by checking whether the two GPUs are connected to the
// same NVSwitch.
nvmlDevice_t second_device;
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
{
nvmlPciInfo_t second_remote_pci_info;
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(
second_device, second_link, &second_remote_pci_info)
!= NVML_SUCCESS)
{


@@ -18,6 +18,7 @@
#include <nvml.h>
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/nvmlWrapper.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/tllmBuffers.h"
#include "tensorrt_llm/runtime/virtualMemory.h"
@@ -57,9 +58,11 @@ protected:
TLLM_CU_CHECK(cuDevicePrimaryCtxRetain(&ctx, dev));
TLLM_CU_CHECK(cuCtxSetCurrent(ctx));
// Initialize NVML
nvmlReturn_t nvmlResult = nvmlInit();
TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", nvmlErrorString(nvmlResult));
// Initialize NVML via wrapper
mNvml = tensorrt_llm::common::NVMLWrapper::getInstance();
nvmlReturn_t nvmlResult = mNvml->nvmlInit();
TLLM_CHECK_WITH_INFO(
nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", mNvml->nvmlErrorString(nvmlResult));
if (!memoryInfoAvailable())
{
@@ -88,14 +91,16 @@ protected:
static size_t getCurrentProcessMemoryInfo()
{
auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
// Get current process ID
uint32_t currentPid = static_cast<uint32_t>(getpid());
// Get device handle for GPU 0
nvmlDevice_t device;
auto nvmlResult = nvmlDeviceGetHandleByIndex(0, &device);
auto nvmlResult = nvml->nvmlDeviceGetHandleByIndex(0, &device);
TLLM_CHECK_WITH_INFO(
nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvmlErrorString(nvmlResult));
nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvml->nvmlErrorString(nvmlResult));
// Get running processes
unsigned int processCount = 1;
@@ -103,9 +108,9 @@ protected:
nvmlResult = NVML_ERROR_INSUFFICIENT_SIZE;
while (nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE)
{
nvmlResult = nvmlDeviceGetComputeRunningProcesses_v3(device, &processCount, processes.data());
nvmlResult = nvml->nvmlDeviceGetComputeRunningProcesses(device, &processCount, processes.data());
TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS || nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE,
"Failed to get process count: %s", nvmlErrorString(nvmlResult));
"Failed to get process count: %s", nvml->nvmlErrorString(nvmlResult));
processes.resize(processCount);
}
@@ -120,6 +125,8 @@ protected:
return 0;
}
std::shared_ptr<tensorrt_llm::common::NVMLWrapper> mNvml;
};
class VirtualMemoryTest : public VirtualMemoryTestBase