[https://nvbugs/5887893][fix] Make NVML work with older CUDA driver versions (#11465)

Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
Iman Tabrizian 2026-02-12 15:06:47 -08:00 committed by GitHub
parent 5130cbd73e
commit dd74f90914
11 changed files with 429 additions and 84 deletions
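
The gist of the change: rather than linking against libnvidia-ml at build time and calling NVML symbols directly, which fails when an older driver ships a library missing the newer symbols, NVML is now dlopen'ed and each symbol is resolved individually at runtime. A minimal standalone sketch of that pattern (illustrative only, not the committed code; build with -ldl on Linux):

#include <cstdio>
#include <dlfcn.h>

int main()
{
    // Load the NVML shared library at runtime instead of at link time.
    void* handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
    if (handle == nullptr)
    {
        std::fprintf(stderr, "NVML unavailable: %s\n", dlerror());
        return 1;
    }
    // Required symbol: present on every supported driver, so absence is fatal.
    auto* init = reinterpret_cast<int (*)()>(dlsym(handle, "nvmlInit_v2"));
    if (init == nullptr)
    {
        std::fprintf(stderr, "missing required symbol nvmlInit_v2\n");
        return 1;
    }
    // Optional symbol: nullptr on an older driver just disables the feature
    // instead of breaking the whole library load.
    void* fabricInfoV = dlsym(handle, "nvmlDeviceGetGpuFabricInfoV");
    std::printf("nvmlDeviceGetGpuFabricInfoV is %savailable\n", fabricInfoV ? "" : "not ");
    dlclose(handle);
    return 0;
}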


@@ -0,0 +1,184 @@
/*
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <dlfcn.h>
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/config.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/common/nvmlWrapper.h"
#include <mutex>
TRTLLM_NAMESPACE_BEGIN
namespace common
{
std::shared_ptr<NVMLWrapper> NVMLWrapper::getInstance()
{
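// Double-checked locking over a weak_ptr: every caller shares one wrapper,
// but it is destroyed (and NVML unloaded) once the last shared_ptr is released.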
static std::mutex mutex;
static std::weak_ptr<NVMLWrapper> instance;
std::shared_ptr<NVMLWrapper> result = instance.lock();
if (result)
{
return result;
}
std::lock_guard<std::mutex> const lock(mutex);
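// Re-check under the lock: another thread may have created the instance
// between the fast-path lock() above and acquiring the mutex.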
result = instance.lock();
if (!result)
{
result = std::shared_ptr<NVMLWrapper>(new NVMLWrapper());
instance = result;
}
return result;
}
NVMLWrapper::NVMLWrapper()
: mHandle(dlopen("libnvidia-ml.so.1", RTLD_LAZY))
{
TLLM_CHECK_WITH_INFO(mHandle != nullptr, "NVML library (libnvidia-ml.so.1) could not be loaded.");
auto loadSym = [](void* handle, char const* name) -> void* { return dlsym(handle, name); };
auto loadRequired = [&](void* handle, char const* name) -> void*
{
void* sym = loadSym(handle, name);
TLLM_CHECK_WITH_INFO(sym != nullptr, "Required NVML symbol not found: %s", name);
return sym;
};
*reinterpret_cast<void**>(&_nvmlInit) = loadRequired(mHandle, "nvmlInit_v2");
*reinterpret_cast<void**>(&_nvmlShutdown) = loadRequired(mHandle, "nvmlShutdown");
*reinterpret_cast<void**>(&_nvmlDeviceGetHandleByIndex) = loadRequired(mHandle, "nvmlDeviceGetHandleByIndex_v2");
*reinterpret_cast<void**>(&_nvmlDeviceGetHandleByPciBusId)
= loadRequired(mHandle, "nvmlDeviceGetHandleByPciBusId_v2");
*reinterpret_cast<void**>(&_nvmlDeviceGetIndex) = loadRequired(mHandle, "nvmlDeviceGetIndex");
*reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkRemotePciInfo)
= loadRequired(mHandle, "nvmlDeviceGetNvLinkRemotePciInfo_v2");
*reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkCapability) = loadRequired(mHandle, "nvmlDeviceGetNvLinkCapability");
*reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkState) = loadRequired(mHandle, "nvmlDeviceGetNvLinkState");
*reinterpret_cast<void**>(&_nvmlErrorString) = loadRequired(mHandle, "nvmlErrorString");
*reinterpret_cast<void**>(&_nvmlDeviceGetComputeRunningProcesses)
= loadRequired(mHandle, "nvmlDeviceGetComputeRunningProcesses_v3");
// Optional symbols - nullptr is OK (older drivers may not have these)
*reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfoV) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfoV");
*reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfo) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfo");
if (!_nvmlDeviceGetGpuFabricInfoV)
{
TLLM_LOG_INFO(
"NVML symbol nvmlDeviceGetGpuFabricInfoV not available (older driver). MNNVL fabric detection will use "
"legacy API or be disabled.");
}
if (!_nvmlDeviceGetGpuFabricInfo)
{
TLLM_LOG_INFO("NVML symbol nvmlDeviceGetGpuFabricInfo not available.");
}
}
NVMLWrapper::~NVMLWrapper()
{
dlclose(mHandle);
}
nvmlReturn_t NVMLWrapper::nvmlInit() const
{
return (*_nvmlInit)();
}
nvmlReturn_t NVMLWrapper::nvmlShutdown() const
{
return (*_nvmlShutdown)();
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const
{
return (*_nvmlDeviceGetHandleByIndex)(index, device);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const
{
return (*_nvmlDeviceGetHandleByPciBusId)(pciBusId, device);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const
{
return (*_nvmlDeviceGetIndex)(device, index);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkRemotePciInfo(
nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const
{
return (*_nvmlDeviceGetNvLinkRemotePciInfo)(device, link, pci);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkCapability(
nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const
{
return (*_nvmlDeviceGetNvLinkCapability)(device, link, capability, capResult);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkState(
nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const
{
return (*_nvmlDeviceGetNvLinkState)(device, link, isActive);
}
char const* NVMLWrapper::nvmlErrorString(nvmlReturn_t result) const
{
return (*_nvmlErrorString)(result);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const
{
if (!_nvmlDeviceGetGpuFabricInfoV)
{
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
return (*_nvmlDeviceGetGpuFabricInfoV)(device, gpuFabricInfo);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const
{
if (!_nvmlDeviceGetGpuFabricInfo)
{
return NVML_ERROR_FUNCTION_NOT_FOUND;
}
return (*_nvmlDeviceGetGpuFabricInfo)(device, gpuFabricInfo);
}
nvmlReturn_t NVMLWrapper::nvmlDeviceGetComputeRunningProcesses(
nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const
{
return (*_nvmlDeviceGetComputeRunningProcesses)(device, infoCount, infos);
}
bool NVMLWrapper::hasGpuFabricInfoV() const
{
return _nvmlDeviceGetGpuFabricInfoV != nullptr;
}
bool NVMLWrapper::hasGpuFabricInfo() const
{
return _nvmlDeviceGetGpuFabricInfo != nullptr;
}
} // namespace common
TRTLLM_NAMESPACE_END
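
Callers can either probe hasGpuFabricInfoV()/hasGpuFabricInfo() up front or call through the wrapper and branch on NVML_ERROR_FUNCTION_NOT_FOUND. A hedged usage sketch of the dispatch the diffs below perform (queryFabricState is a hypothetical helper, not part of this commit):

#include "tensorrt_llm/common/nvmlWrapper.h"
#include <cstring>

// Read the fabric state via the versioned API when the driver exposes it,
// otherwise fall back to the legacy API.
nvmlReturn_t queryFabricState(nvmlDevice_t device, nvmlGpuFabricState_t* state)
{
    auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
    if (nvml->hasGpuFabricInfoV())
    {
        nvmlGpuFabricInfoV_t info;
        std::memset(&info, 0, sizeof(info));
        info.version = nvmlGpuFabricInfo_v2; // the v2 layout is understood by older drivers than v3
        nvmlReturn_t result = nvml->nvmlDeviceGetGpuFabricInfoV(device, &info);
        if (result == NVML_SUCCESS)
        {
            *state = info.state;
        }
        return result;
    }
    if (nvml->hasGpuFabricInfo())
    {
        nvmlGpuFabricInfo_t info;
        nvmlReturn_t result = nvml->nvmlDeviceGetGpuFabricInfo(device, &info);
        if (result == NVML_SUCCESS)
        {
            *state = info.state;
        }
        return result;
    }
    return NVML_ERROR_FUNCTION_NOT_FOUND; // neither API exists on this driver
}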


@@ -0,0 +1,123 @@
/*
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef NVML_WRAPPER_H
#define NVML_WRAPPER_H
#include "tensorrt_llm/common/config.h"
#include <nvml.h>
#include <memory>
TRTLLM_NAMESPACE_BEGIN
namespace common
{
class NVMLWrapper
{
public:
static std::shared_ptr<NVMLWrapper> getInstance();
~NVMLWrapper();
NVMLWrapper(NVMLWrapper const&) = delete;
NVMLWrapper& operator=(NVMLWrapper const&) = delete;
NVMLWrapper(NVMLWrapper&&) = delete;
NVMLWrapper& operator=(NVMLWrapper&&) = delete;
// Required NVML functions
nvmlReturn_t nvmlInit() const;
nvmlReturn_t nvmlShutdown() const;
nvmlReturn_t nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const;
nvmlReturn_t nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const;
nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const;
nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const;
nvmlReturn_t nvmlDeviceGetNvLinkCapability(
nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const;
nvmlReturn_t nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const;
char const* nvmlErrorString(nvmlReturn_t result) const;
nvmlReturn_t nvmlDeviceGetComputeRunningProcesses(
nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const;
// Optional NVML functions (may be nullptr on older drivers)
nvmlReturn_t nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const;
nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const;
// Runtime availability checks
bool hasGpuFabricInfoV() const;
bool hasGpuFabricInfo() const;
private:
void* mHandle;
NVMLWrapper();
// Required function pointers
nvmlReturn_t (*_nvmlInit)();
nvmlReturn_t (*_nvmlShutdown)();
nvmlReturn_t (*_nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t*);
nvmlReturn_t (*_nvmlDeviceGetHandleByPciBusId)(char const*, nvmlDevice_t*);
nvmlReturn_t (*_nvmlDeviceGetIndex)(nvmlDevice_t, unsigned int*);
nvmlReturn_t (*_nvmlDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t, unsigned int, nvmlPciInfo_t*);
nvmlReturn_t (*_nvmlDeviceGetNvLinkCapability)(nvmlDevice_t, unsigned int, nvmlNvLinkCapability_t, unsigned int*);
nvmlReturn_t (*_nvmlDeviceGetNvLinkState)(nvmlDevice_t, unsigned int, nvmlEnableState_t*);
char const* (*_nvmlErrorString)(nvmlReturn_t);
nvmlReturn_t (*_nvmlDeviceGetComputeRunningProcesses)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*);
// Optional function pointers (may be nullptr)
nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfoV)(nvmlDevice_t, nvmlGpuFabricInfoV_t*);
nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfo)(nvmlDevice_t, nvmlGpuFabricInfo_t*);
};
// RAII class that initializes NVML on construction and shuts down on destruction.
// Replaces duplicated NvmlManager classes in allreduceOp.cpp and allreducePlugin.cpp.
class NvmlManager
{
public:
NvmlManager()
: mNvml(NVMLWrapper::getInstance())
{
auto result = mNvml->nvmlInit();
if (result != NVML_SUCCESS)
{
TLLM_THROW("Failed to initialize NVML: %s", mNvml->nvmlErrorString(result));
}
}
~NvmlManager()
{
mNvml->nvmlShutdown();
}
NVMLWrapper const& wrapper() const
{
return *mNvml;
}
std::shared_ptr<NVMLWrapper> const& sharedWrapper() const
{
return mNvml;
}
private:
std::shared_ptr<NVMLWrapper> mNvml;
};
} // namespace common
TRTLLM_NAMESPACE_END
#endif // NVML_WRAPPER_H
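
A short usage sketch of the RAII manager (the caller is hypothetical, for illustration): NVML initialization and shutdown are tied to the object's scope, which is exactly how the plugin and op diffs below use it.

#include "tensorrt_llm/common/nvmlWrapper.h"

void probeFirstDevice() // hypothetical caller
{
    tensorrt_llm::common::NvmlManager manager; // nvmlInit() runs here; throws on failure
    auto const& nvml = manager.sharedWrapper();
    nvmlDevice_t device;
    if (nvml->nvmlDeviceGetHandleByIndex(0, &device) != NVML_SUCCESS)
    {
        return; // no usable device; shutdown still runs below
    }
    // ... query NVLink or fabric state through nvml ...
} // nvmlShutdown() runs on scope exit, even during exception unwinding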


@@ -38,6 +38,8 @@
#include <string>
#include <unordered_map>
#include "tensorrt_llm/common/nvmlWrapper.h"
TRTLLM_NAMESPACE_BEGIN
namespace common::op
@@ -319,7 +321,8 @@ TRTLLM_NAMESPACE_END
nvmlReturn_t r = cmd; \
if (r != NVML_SUCCESS) \
{ \
printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)
@@ -330,6 +333,7 @@ TRTLLM_NAMESPACE_END
nvmlReturn_t r = cmd; \
if (TLLM_UNLIKELY(r != NVML_SUCCESS)) \
{ \
TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r)); \
} \
} while (0)


@@ -53,7 +53,6 @@ target_link_libraries(
${TORCH_LIBRARIES}
torch_python
${CUDA_DRV_LIB}
${CUDA_NVML_LIB}
th_common
pg_utils)
target_compile_definitions(


@@ -170,7 +170,6 @@ target_link_libraries(
${CUBLASLT_LIB}
${TRT_LIB}
${CUDA_DRV_LIB}
${CUDA_NVML_LIB}
${CUDA_RT_LIB}
${CMAKE_DL_LIBS}
${SHARED_TARGET})


@@ -19,6 +19,7 @@
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/customAllReduceUtils.h"
#include "tensorrt_llm/common/dataType.h"
#include "tensorrt_llm/common/nvmlWrapper.h"
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
#include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
@@ -601,19 +602,8 @@ bool AllreducePlugin::isCustomAllReduceSupported(int ranks_per_node) const noexcept
&& (static_cast<size_t>(ranks_per_node) <= kernels::MAX_RANKS_PER_NODE) && (ranks_per_node > 0);
}
class NvmlManager
{
public:
NvmlManager()
{
NVML_CHECK(nvmlInit());
}
~NvmlManager()
{
NVML_CHECK(nvmlShutdown());
}
};
using tensorrt_llm::common::NvmlManager;
using tensorrt_llm::common::NVMLWrapper;
std::set<int> getLocalGroup(std::set<int> const& group)
{
@@ -711,6 +701,7 @@ void AllreducePlugin::setGroupTopology() noexcept
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
NvmlManager nvmlManager;
auto const& nvml = nvmlManager.sharedWrapper();
std::unordered_set<int> visitedDevice;
mIsP2PSupported = true;
mIsNVLINKSupported = true;
@@ -738,26 +729,26 @@
}
nvmlDevice_t firstDevice;
NVML_CHECK(nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
bool isNVLINK = false;
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
{
nvmlPciInfo_t remotePciInfo;
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
{
continue;
}
nvmlDevice_t remoteDevice;
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remotePciInfo.busId, &remoteDevice);
auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remotePciInfo.busId, &remoteDevice);
if (result == NVML_SUCCESS)
{
// Two GPUs are connected directly through nvlink
unsigned int remoteDeviceId;
NVML_CHECK(nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
NVML_CHECK(nvml->nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
if (remoteDeviceId == static_cast<unsigned int>(secondDeviceId))
{
@@ -770,12 +761,12 @@
// remotePciInfo now holds the PCI information of an NVSwitch; determine
// NVLink support by checking whether both GPUs are connected to the same NVSwitch.
nvmlDevice_t secondDevice;
NVML_CHECK(nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
for (unsigned int secondLink = 0; secondLink < NVML_NVLINK_MAX_LINKS; secondLink++)
{
nvmlPciInfo_t secondRemotePciInfo;
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(secondDevice, secondLink, &secondRemotePciInfo)
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(secondDevice, secondLink, &secondRemotePciInfo)
!= NVML_SUCCESS)
{
continue;


@@ -81,7 +81,6 @@ set_property(TARGET runtime_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_cuda_architectures(runtime_src 89)
target_include_directories(runtime_src PRIVATE ${MPI_C_INCLUDE_DIRS})
target_link_libraries(runtime_src PUBLIC ${CUDA_NVML_LIB})
if(ENABLE_MULTI_DEVICE)
target_link_libraries(runtime_src PUBLIC ${NCCL_LIB})


@@ -23,7 +23,7 @@
#include <nvshmem/nvshmemx.h>
#endif
#if ENABLE_MULTI_DEVICE
#include <nvml.h>
#include "tensorrt_llm/common/nvmlWrapper.h"
#endif
#include <unistd.h>
@@ -46,7 +46,8 @@
nvmlReturn_t retval = cmd; \
if (retval != NVML_SUCCESS) \
{ \
printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(retval)); \
printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(retval)); \
exit(EXIT_FAILURE); \
} \
} while (0)
@@ -329,18 +330,41 @@ private:
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
tensorrt_llm::common::NvmlManager nvmlManager;
nvmlDevice_t nvml_device;
nvmlGpuFabricInfo_t fabric_info;
NVMLCHECK(nvmlInit_v2());
NVMLCHECK(nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
NVMLCHECK(nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
NVMLCHECK(nvmlShutdown());
NVMLCHECK(nvml->nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
nvmlGpuFabricState_t fabric_state;
nvmlReturn_t fabric_status;
if (nvml->hasGpuFabricInfoV())
{
nvmlGpuFabricInfoV_t fabric_info_v;
memset(&fabric_info_v, 0, sizeof(fabric_info_v));
fabric_info_v.version = nvmlGpuFabricInfo_v2;
NVMLCHECK(nvml->nvmlDeviceGetGpuFabricInfoV(nvml_device, &fabric_info_v));
fabric_state = fabric_info_v.state;
fabric_status = fabric_info_v.status;
}
else if (nvml->hasGpuFabricInfo())
{
nvmlGpuFabricInfo_t fabric_info;
NVMLCHECK(nvml->nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
fabric_state = fabric_info.state;
fabric_status = fabric_info.status;
}
else
{
TLLM_LOG_TRACE("checking fabric support... NVML fabric info APIs not available.");
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
// Check if the fabric is fully initialized.
if (fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_info.status != NVML_SUCCESS)
if (fabric_state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_status != NVML_SUCCESS)
{
TLLM_LOG_TRACE("checking fabric support... fabric state is NOT COMPLETE: state=%u status=%u.",
fabric_info.state, fabric_info.status);
TLLM_LOG_TRACE("checking fabric support... fabric state is NOT COMPLETE: state=%u status=%u.", fabric_state,
fabric_status);
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
@@ -381,8 +405,7 @@ private:
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
TLLM_LOG_TRACE("fabric status: state=%u status=%u clique=%u", device_id, fabric_info.state, fabric_info.status,
fabric_info.cliqueId);
TLLM_LOG_TRACE("fabric status: state=%u status=%u", device_id, fabric_state, fabric_status);
CUCHECK(cuMemRelease(handle));
// If we get here, fabric handles are supported.
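
Condensed, the fallback policy this function now implements (a hedged paraphrase of the control flow above, not the committed code; chooseHandleType is illustrative):

#include <cuda.h>
#include <nvml.h>

// Fabric handles only when the fabric probe succeeded and the fabric is fully
// initialized; anything else (including missing NVML APIs) falls back to POSIX FDs.
CUmemAllocationHandleType chooseHandleType(nvmlGpuFabricState_t state, nvmlReturn_t status)
{
    if (state == NVML_GPU_FABRIC_STATE_COMPLETED && status == NVML_SUCCESS)
    {
        return CU_MEM_HANDLE_TYPE_FABRIC;
    }
    return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}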


@@ -134,8 +134,7 @@ endif()
if(ENABLE_MULTI_DEVICE)
target_include_directories(th_common PUBLIC ${MPI_C_INCLUDE_DIRS})
target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB}
CUDA::nvml)
target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB})
endif()
if(NOT WIN32)


@@ -21,6 +21,7 @@
#include "tensorrt_llm/common/dataType.h"
#include "tensorrt_llm/common/mcastDevMemUtils.h"
#include "tensorrt_llm/common/ncclUtils.h"
#include "tensorrt_llm/common/nvmlWrapper.h"
#include "tensorrt_llm/common/opUtils.h"
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
#include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
@@ -85,19 +86,8 @@ struct overloaded : Ts...
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;
class NvmlManager
{
public:
NvmlManager()
{
NVML_CHECK_THROW(nvmlInit());
}
~NvmlManager()
{
NVML_CHECK(nvmlShutdown());
}
};
using tensorrt_llm::common::NvmlManager;
using tensorrt_llm::common::NVMLWrapper;
std::set<int> getLocalGroup(std::set<int> const& group)
{
@@ -965,7 +955,7 @@ private:
MNNVLFabricInfo info;
#if ENABLE_MULTI_DEVICE
// 1. Check CUDA driver version (needs >= 12.0.10)
// Check CUDA driver version (needs >= 12.0.10)
int cudaDriverVersion = -1;
TLLM_CUDA_CHECK(cudaDriverGetVersion(&cudaDriverVersion));
if (cudaDriverVersion < 12010)
@@ -974,7 +964,7 @@ private:
return info;
}
// 2. Check multicast support
// Check multicast support
CUdevice cuDevice;
TLLM_CU_CHECK(cuDeviceGet(&cuDevice, deviceId));
auto cudaDriver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
@@ -988,7 +978,7 @@ private:
return info;
}
// 3. Check fabric handle support
// Check fabric handle support
int fabricHandleSupported = 0;
TLLM_CU_CHECK(cudaDriver->cuDeviceGetAttribute(
&fabricHandleSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cuDevice));
@@ -998,9 +988,10 @@ private:
return info;
}
// 4. Check NVML GPU Fabric Info using versioned API
// Check NVML GPU Fabric Info using versioned API (runtime dispatch)
auto nvml = NVMLWrapper::getInstance();
nvmlDevice_t nvmlDevice;
nvmlReturn_t nvmlResult = nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
nvmlReturn_t nvmlResult = nvml->nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
if (nvmlResult != NVML_SUCCESS)
{
TLLM_LOG_DEBUG("MNNVL check: Failed to get NVML device handle for device %d - error=%d", deviceId,
@@ -1008,24 +999,48 @@ private:
return info;
}
nvmlGpuFabricInfoV_t fabricInfoV;
std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
fabricInfoV.version = NVML_STRUCT_VERSION(GpuFabricInfo, 3);
nvmlResult = nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
nvmlGpuFabricState_t fabricState;
nvmlReturn_t fabricStatus;
unsigned char fabricClusterUuid[NVML_GPU_FABRIC_UUID_LEN];
unsigned int fabricCliqueId;
if (nvml->hasGpuFabricInfoV())
{
nvmlGpuFabricInfoV_t fabricInfoV;
std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
fabricInfoV.version = nvmlGpuFabricInfo_v2;
nvmlResult = nvml->nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
fabricState = fabricInfoV.state;
fabricStatus = fabricInfoV.status;
std::memcpy(fabricClusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
fabricCliqueId = fabricInfoV.cliqueId;
}
else if (nvml->hasGpuFabricInfo())
{
nvmlGpuFabricInfo_t fabricInfoLegacy;
nvmlResult = nvml->nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfoLegacy);
fabricState = fabricInfoLegacy.state;
fabricStatus = fabricInfoLegacy.status;
std::memcpy(fabricClusterUuid, fabricInfoLegacy.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
fabricCliqueId = fabricInfoLegacy.cliqueId;
}
else
{
TLLM_LOG_DEBUG("MNNVL check: Neither nvmlDeviceGetGpuFabricInfoV nor nvmlDeviceGetGpuFabricInfo available");
return info;
}
if (nvmlResult != NVML_SUCCESS)
{
TLLM_LOG_DEBUG(
"MNNVL check: nvmlDeviceGetGpuFabricInfoV failed for device %d - error=%d (not supported or "
"MNNVL check: nvmlDeviceGetGpuFabricInfo failed for device %d - error=%d (not supported or "
"no fabric manager)",
deviceId, static_cast<int>(nvmlResult));
return info;
}
// Check if fabric is fully initialized
if (fabricInfoV.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfoV.status != NVML_SUCCESS)
if (fabricState != NVML_GPU_FABRIC_STATE_COMPLETED || fabricStatus != NVML_SUCCESS)
{
TLLM_LOG_DEBUG(
"MNNVL check: Fabric state not complete - state=%u status=%u", fabricInfoV.state, fabricInfoV.status);
TLLM_LOG_DEBUG("MNNVL check: Fabric state not complete - state=%u status=%u", fabricState, fabricStatus);
return info;
}
@@ -1034,7 +1049,7 @@ private:
bool clusterUuidValid = false;
for (int i = 0; i < NVML_GPU_FABRIC_UUID_LEN; ++i)
{
if (fabricInfoV.clusterUuid[i] != 0)
if (fabricClusterUuid[i] != 0)
{
clusterUuidValid = true;
break;
@@ -1047,7 +1062,7 @@ private:
return info;
}
// 5. Check NVLink links are active (similar to Python support_nvlink(True))
// Check NVLink links are active (similar to Python support_nvlink(True))
unsigned int activeLinks = 0;
unsigned int availableLinks = 0;
@@ -1055,12 +1070,12 @@
{
unsigned int capP2p = 0;
nvmlReturn_t capResult
= nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
= nvml->nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
if (capResult == NVML_SUCCESS && capP2p)
{
availableLinks++;
nvmlEnableState_t linkState;
if (nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
if (nvml->nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
&& linkState == NVML_FEATURE_ENABLED)
{
activeLinks++;
@@ -1077,12 +1092,12 @@
}
// Device supports MNNVL - copy fabric info
std::memcpy(info.clusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
info.cliqueId = fabricInfoV.cliqueId;
std::memcpy(info.clusterUuid, fabricClusterUuid, NVML_GPU_FABRIC_UUID_LEN);
info.cliqueId = fabricCliqueId;
info.isValid = true;
TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (clusterUuid=%s, cliqueId=%u)", deviceId,
info.getClusterUuidString().c_str(), fabricInfoV.cliqueId);
info.getClusterUuidString().c_str(), fabricCliqueId);
#endif
return info;
}
@@ -1104,6 +1119,7 @@ private:
bool is_inter_node = (mGroup.size() != local_group.size());
NvmlManager nvml_manager;
auto const& nvml = nvml_manager.sharedWrapper();
mIsP2PSupported = true;
mIsNVLINKSupported = true;
mIsMNNVLSupported = false;
@@ -1134,26 +1150,27 @@
}
nvmlDevice_t first_device;
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
bool is_NVLINK = false;
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
{
nvmlPciInfo_t remote_pci_info;
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(first_device, link, &remote_pci_info)
!= NVML_SUCCESS)
{
continue;
}
nvmlDevice_t remote_device;
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remote_pci_info.busId, &remote_device);
if (result == NVML_SUCCESS)
{
// Two GPUs are connected directly through nvlink
unsigned int remote_device_id;
NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
NVML_CHECK_THROW(nvml->nvmlDeviceGetIndex(remote_device, &remote_device_id));
if (remote_device_id == static_cast<unsigned int>(second_device_id))
{
@@ -1167,12 +1184,12 @@
// determine NVLink support by checking whether the two GPUs are connected to the
// same NVSwitch.
nvmlDevice_t second_device;
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
{
nvmlPciInfo_t second_remote_pci_info;
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(
second_device, second_link, &second_remote_pci_info)
!= NVML_SUCCESS)
{


@@ -18,6 +18,7 @@
#include <nvml.h>
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/nvmlWrapper.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/tllmBuffers.h"
#include "tensorrt_llm/runtime/virtualMemory.h"
@@ -57,9 +58,11 @@ protected:
TLLM_CU_CHECK(cuDevicePrimaryCtxRetain(&ctx, dev));
TLLM_CU_CHECK(cuCtxSetCurrent(ctx));
// Initialize NVML
nvmlReturn_t nvmlResult = nvmlInit();
TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", nvmlErrorString(nvmlResult));
// Initialize NVML via wrapper
mNvml = tensorrt_llm::common::NVMLWrapper::getInstance();
nvmlReturn_t nvmlResult = mNvml->nvmlInit();
TLLM_CHECK_WITH_INFO(
nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", mNvml->nvmlErrorString(nvmlResult));
if (!memoryInfoAvailable())
{
@@ -88,14 +91,16 @@ protected:
static size_t getCurrentProcessMemoryInfo()
{
auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
// Get current process ID
uint32_t currentPid = static_cast<uint32_t>(getpid());
// Get device handle for GPU 0
nvmlDevice_t device;
auto nvmlResult = nvmlDeviceGetHandleByIndex(0, &device);
auto nvmlResult = nvml->nvmlDeviceGetHandleByIndex(0, &device);
TLLM_CHECK_WITH_INFO(
nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvmlErrorString(nvmlResult));
nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvml->nvmlErrorString(nvmlResult));
// Get running processes
unsigned int processCount = 1;
@@ -103,9 +108,9 @@ protected:
nvmlResult = NVML_ERROR_INSUFFICIENT_SIZE;
while (nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE)
{
nvmlResult = nvmlDeviceGetComputeRunningProcesses_v3(device, &processCount, processes.data());
nvmlResult = nvml->nvmlDeviceGetComputeRunningProcesses(device, &processCount, processes.data());
TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS || nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE,
"Failed to get process count: %s", nvmlErrorString(nvmlResult));
"Failed to get process count: %s", nvml->nvmlErrorString(nvmlResult));
processes.resize(processCount);
}
@@ -120,6 +125,8 @@ protected:
return 0;
}
std::shared_ptr<tensorrt_llm::common::NVMLWrapper> mNvml;
};
class VirtualMemoryTest : public VirtualMemoryTestBase