diff --git a/cpp/tensorrt_llm/common/nvmlWrapper.cpp b/cpp/tensorrt_llm/common/nvmlWrapper.cpp
new file mode 100644
index 0000000000..0f647cbe55
--- /dev/null
+++ b/cpp/tensorrt_llm/common/nvmlWrapper.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <dlfcn.h>
+
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
+#include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/common/nvmlWrapper.h"
+
+#include <mutex>
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
+{
+
+std::shared_ptr<NVMLWrapper> NVMLWrapper::getInstance()
+{
+    static std::mutex mutex;
+    static std::weak_ptr<NVMLWrapper> instance;
+    std::shared_ptr<NVMLWrapper> result = instance.lock();
+    if (result)
+    {
+        return result;
+    }
+
+    std::lock_guard<std::mutex> const lock(mutex);
+    result = instance.lock();
+    if (!result)
+    {
+        result = std::shared_ptr<NVMLWrapper>(new NVMLWrapper());
+        instance = result;
+    }
+    return result;
+}
+
+NVMLWrapper::NVMLWrapper()
+    : mHandle(dlopen("libnvidia-ml.so.1", RTLD_LAZY))
+{
+    TLLM_CHECK_WITH_INFO(mHandle != nullptr, "NVML library (libnvidia-ml.so.1) could not be loaded.");
+
+    auto loadSym = [](void* handle, char const* name) -> void* { return dlsym(handle, name); };
+
+    auto loadRequired = [&](void* handle, char const* name) -> void*
+    {
+        void* sym = loadSym(handle, name);
+        TLLM_CHECK_WITH_INFO(sym != nullptr, "Required NVML symbol not found: %s", name);
+        return sym;
+    };
+
+    *reinterpret_cast<void**>(&_nvmlInit) = loadRequired(mHandle, "nvmlInit_v2");
+    *reinterpret_cast<void**>(&_nvmlShutdown) = loadRequired(mHandle, "nvmlShutdown");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetHandleByIndex) = loadRequired(mHandle, "nvmlDeviceGetHandleByIndex_v2");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetHandleByPciBusId)
+        = loadRequired(mHandle, "nvmlDeviceGetHandleByPciBusId_v2");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetIndex) = loadRequired(mHandle, "nvmlDeviceGetIndex");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkRemotePciInfo)
+        = loadRequired(mHandle, "nvmlDeviceGetNvLinkRemotePciInfo_v2");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkCapability) = loadRequired(mHandle, "nvmlDeviceGetNvLinkCapability");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkState) = loadRequired(mHandle, "nvmlDeviceGetNvLinkState");
+    *reinterpret_cast<void**>(&_nvmlErrorString) = loadRequired(mHandle, "nvmlErrorString");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetComputeRunningProcesses)
+        = loadRequired(mHandle, "nvmlDeviceGetComputeRunningProcesses_v3");
+
+    // Optional symbols - nullptr is OK (older drivers may not have these)
+    *reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfoV) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfoV");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfo) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfo");
+
+    if (!_nvmlDeviceGetGpuFabricInfoV)
+    {
+        TLLM_LOG_INFO(
+            "NVML symbol nvmlDeviceGetGpuFabricInfoV not available (older driver). MNNVL fabric detection will use "
+            "legacy API or be disabled.");
+    }
+    if (!_nvmlDeviceGetGpuFabricInfo)
+    {
+        TLLM_LOG_INFO("NVML symbol nvmlDeviceGetGpuFabricInfo not available.");
+    }
+}
+
+NVMLWrapper::~NVMLWrapper()
+{
+    dlclose(mHandle);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlInit() const
+{
+    return (*_nvmlInit)();
+}
+
+nvmlReturn_t NVMLWrapper::nvmlShutdown() const
+{
+    return (*_nvmlShutdown)();
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const
+{
+    return (*_nvmlDeviceGetHandleByIndex)(index, device);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const
+{
+    return (*_nvmlDeviceGetHandleByPciBusId)(pciBusId, device);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const
+{
+    return (*_nvmlDeviceGetIndex)(device, index);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkRemotePciInfo(
+    nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const
+{
+    return (*_nvmlDeviceGetNvLinkRemotePciInfo)(device, link, pci);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkCapability(
+    nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const
+{
+    return (*_nvmlDeviceGetNvLinkCapability)(device, link, capability, capResult);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkState(
+    nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const
+{
+    return (*_nvmlDeviceGetNvLinkState)(device, link, isActive);
+}
+
+char const* NVMLWrapper::nvmlErrorString(nvmlReturn_t result) const
+{
+    return (*_nvmlErrorString)(result);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const
+{
+    if (!_nvmlDeviceGetGpuFabricInfoV)
+    {
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    }
+    return (*_nvmlDeviceGetGpuFabricInfoV)(device, gpuFabricInfo);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const
+{
+    if (!_nvmlDeviceGetGpuFabricInfo)
+    {
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    }
+    return (*_nvmlDeviceGetGpuFabricInfo)(device, gpuFabricInfo);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetComputeRunningProcesses(
+    nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const
+{
+    return (*_nvmlDeviceGetComputeRunningProcesses)(device, infoCount, infos);
+}
+
+bool NVMLWrapper::hasGpuFabricInfoV() const
+{
+    return _nvmlDeviceGetGpuFabricInfoV != nullptr;
+}
+
+bool NVMLWrapper::hasGpuFabricInfo() const
+{
+    return _nvmlDeviceGetGpuFabricInfo != nullptr;
+}
+
+} // namespace common
+
+TRTLLM_NAMESPACE_END
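Note on the pattern above: getInstance() mirrors the lazy-singleton idiom used by CUDADriverWrapper elsewhere in the codebase. The static weak_ptr lets the dlopen'd handle be released (the destructor dlclose's it) once the last shared_ptr owner goes away, and the second instance.lock() under the mutex prevents constructing the wrapper twice. A minimal, self-contained sketch of the idiom; the class name is illustrative, not part of the patch:

#include <memory>
#include <mutex>

class LazySingleton // hypothetical stand-in for NVMLWrapper
{
public:
    static std::shared_ptr<LazySingleton> getInstance()
    {
        static std::mutex mutex;
        static std::weak_ptr<LazySingleton> instance;
        // Fast path: reuse a live instance without taking the lock.
        if (auto existing = instance.lock())
        {
            return existing;
        }
        // Slow path: re-check under the lock, then construct once.
        std::lock_guard<std::mutex> const lock(mutex);
        auto result = instance.lock();
        if (!result)
        {
            result = std::shared_ptr<LazySingleton>(new LazySingleton());
            instance = result;
        }
        return result;
    }

private:
    LazySingleton() = default; // resource acquisition (e.g. dlopen) would go here
};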
diff --git a/cpp/tensorrt_llm/common/nvmlWrapper.h b/cpp/tensorrt_llm/common/nvmlWrapper.h
new file mode 100644
index 0000000000..edb76d9033
--- /dev/null
+++ b/cpp/tensorrt_llm/common/nvmlWrapper.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef NVML_WRAPPER_H
+#define NVML_WRAPPER_H
+
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
+
+#include <nvml.h>
+
+#include <memory>
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
+{
+
+class NVMLWrapper
+{
+public:
+    static std::shared_ptr<NVMLWrapper> getInstance();
+
+    ~NVMLWrapper();
+    NVMLWrapper(NVMLWrapper const&) = delete;
+    NVMLWrapper& operator=(NVMLWrapper const&) = delete;
+    NVMLWrapper(NVMLWrapper&&) = delete;
+    NVMLWrapper& operator=(NVMLWrapper&&) = delete;
+
+    // Required NVML functions
+    nvmlReturn_t nvmlInit() const;
+    nvmlReturn_t nvmlShutdown() const;
+    nvmlReturn_t nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const;
+    nvmlReturn_t nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const;
+    nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const;
+    nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const;
+    nvmlReturn_t nvmlDeviceGetNvLinkCapability(
+        nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const;
+    nvmlReturn_t nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const;
+    char const* nvmlErrorString(nvmlReturn_t result) const;
+    nvmlReturn_t nvmlDeviceGetComputeRunningProcesses(
+        nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const;
+
+    // Optional NVML functions (may be nullptr on older drivers)
+    nvmlReturn_t nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const;
+    nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const;
+
+    // Runtime availability checks
+    bool hasGpuFabricInfoV() const;
+    bool hasGpuFabricInfo() const;
+
+private:
+    void* mHandle;
+    NVMLWrapper();
+
+    // Required function pointers
+    nvmlReturn_t (*_nvmlInit)();
+    nvmlReturn_t (*_nvmlShutdown)();
+    nvmlReturn_t (*_nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t*);
+    nvmlReturn_t (*_nvmlDeviceGetHandleByPciBusId)(char const*, nvmlDevice_t*);
+    nvmlReturn_t (*_nvmlDeviceGetIndex)(nvmlDevice_t, unsigned int*);
+    nvmlReturn_t (*_nvmlDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t, unsigned int, nvmlPciInfo_t*);
+    nvmlReturn_t (*_nvmlDeviceGetNvLinkCapability)(nvmlDevice_t, unsigned int, nvmlNvLinkCapability_t, unsigned int*);
+    nvmlReturn_t (*_nvmlDeviceGetNvLinkState)(nvmlDevice_t, unsigned int, nvmlEnableState_t*);
+    char const* (*_nvmlErrorString)(nvmlReturn_t);
+    nvmlReturn_t (*_nvmlDeviceGetComputeRunningProcesses)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*);
+
+    // Optional function pointers (may be nullptr)
+    nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfoV)(nvmlDevice_t, nvmlGpuFabricInfoV_t*);
+    nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfo)(nvmlDevice_t, nvmlGpuFabricInfo_t*);
+};
+
+// RAII class that initializes NVML on construction and shuts down on destruction.
+// Replaces duplicated NvmlManager classes in allreduceOp.cpp and allreducePlugin.cpp.
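Taken together, the two new files give call sites a scoped initialization pattern: NvmlManager (defined just below in the same header) pins nvmlInit()/nvmlShutdown() to a C++ scope, and every NVML entry point is reached through the dlopen'd wrapper, so nothing has to link against libnvidia-ml at build time. A hypothetical usage sketch; the function name and device index are illustrative, not part of the patch:

#include "tensorrt_llm/common/nvmlWrapper.h"

#include <cstdio>

// Illustrative helper, not part of the patch.
bool deviceZeroIsVisible()
{
    // nvmlInit() runs here; nvmlShutdown() runs when 'manager' leaves scope.
    tensorrt_llm::common::NvmlManager manager;
    auto const& nvml = manager.sharedWrapper();

    nvmlDevice_t device;
    nvmlReturn_t const result = nvml->nvmlDeviceGetHandleByIndex(0, &device);
    if (result != NVML_SUCCESS)
    {
        // nvmlErrorString is also resolved through the dlopen'd library.
        std::printf("NVML error: %s\n", nvml->nvmlErrorString(result));
        return false;
    }
    return true;
}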
+class NvmlManager
+{
+public:
+    NvmlManager()
+        : mNvml(NVMLWrapper::getInstance())
+    {
+        auto result = mNvml->nvmlInit();
+        if (result != NVML_SUCCESS)
+        {
+            TLLM_THROW("Failed to initialize NVML: %s", mNvml->nvmlErrorString(result));
+        }
+    }
+
+    ~NvmlManager()
+    {
+        mNvml->nvmlShutdown();
+    }
+
+    NVMLWrapper const& wrapper() const
+    {
+        return *mNvml;
+    }
+
+    std::shared_ptr<NVMLWrapper> const& sharedWrapper() const
+    {
+        return mNvml;
+    }
+
+private:
+    std::shared_ptr<NVMLWrapper> mNvml;
+};
+
+} // namespace common
+
+TRTLLM_NAMESPACE_END
+
+#endif // NVML_WRAPPER_H
diff --git a/cpp/tensorrt_llm/common/opUtils.h b/cpp/tensorrt_llm/common/opUtils.h
index 3018a5da10..72e5a5ea3e 100644
--- a/cpp/tensorrt_llm/common/opUtils.h
+++ b/cpp/tensorrt_llm/common/opUtils.h
@@ -38,6 +38,8 @@
 #include
 #include
+
+#include "tensorrt_llm/common/nvmlWrapper.h"
 
 TRTLLM_NAMESPACE_BEGIN
 
 namespace common::op
@@ -319,7 +321,8 @@ TRTLLM_NAMESPACE_END
         nvmlReturn_t r = cmd;                                                                      \
         if (r != NVML_SUCCESS)                                                                     \
         {                                                                                          \
-            printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r));     \
+            printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__,                          \
+                tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r));             \
             exit(EXIT_FAILURE);                                                                    \
         }                                                                                          \
     } while (0)
@@ -330,6 +333,7 @@ TRTLLM_NAMESPACE_END
         nvmlReturn_t r = cmd;                                                                      \
         if (TLLM_UNLIKELY(r != NVML_SUCCESS))                                                      \
         {                                                                                          \
-            TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
+            TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__,                      \
+                tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r));             \
         }                                                                                          \
     } while (0)
diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
index 999c2b736c..d079a96f49 100755
--- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -53,7 +53,6 @@ target_link_libraries(
   ${TORCH_LIBRARIES}
   torch_python
   ${CUDA_DRV_LIB}
-  ${CUDA_NVML_LIB}
   th_common
   pg_utils)
 target_compile_definitions(
diff --git a/cpp/tensorrt_llm/plugins/CMakeLists.txt b/cpp/tensorrt_llm/plugins/CMakeLists.txt
index 3c25440366..8b89cccdc8 100755
--- a/cpp/tensorrt_llm/plugins/CMakeLists.txt
+++ b/cpp/tensorrt_llm/plugins/CMakeLists.txt
@@ -170,7 +170,6 @@ target_link_libraries(
   ${CUBLASLT_LIB}
   ${TRT_LIB}
   ${CUDA_DRV_LIB}
-  ${CUDA_NVML_LIB}
   ${CUDA_RT_LIB}
   ${CMAKE_DL_LIBS}
   ${SHARED_TARGET})
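The macro bodies above change only how the error string is obtained, so existing NVML_CHECK/NVML_CHECK_THROW call sites compile unchanged: the macros still accept any expression yielding an nvmlReturn_t, including calls through the wrapper. A hedged sketch of a call site, assuming NVML was already initialized (e.g. by an NvmlManager in an enclosing scope); the function name is hypothetical:

#include "tensorrt_llm/common/opUtils.h"

// Illustrative snippet, not part of the patch.
void checkedHandleLookup()
{
    auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
    nvmlDevice_t device;
    // On failure, NVML_CHECK prints file/line plus the wrapper-resolved error
    // string and exits; NVML_CHECK_THROW throws via TLLM_THROW instead.
    NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(0, &device));
}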
diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp
index 112364400d..24d9aff418 100644
--- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp
+++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp
@@ -19,6 +19,7 @@
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/customAllReduceUtils.h"
 #include "tensorrt_llm/common/dataType.h"
+#include "tensorrt_llm/common/nvmlWrapper.h"
 #include "tensorrt_llm/kernels/customAllReduceKernels.h"
 #include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
 #include "tensorrt_llm/runtime/utils/mpiUtils.h"
@@ -601,19 +602,8 @@ bool AllreducePlugin::isCustomAllReduceSupported(int ranks_per_node) const noexcept
         && (static_cast<size_t>(ranks_per_node) <= kernels::MAX_RANKS_PER_NODE) && (ranks_per_node > 0);
 }
 
-class NvmlManager
-{
-public:
-    NvmlManager()
-    {
-        NVML_CHECK(nvmlInit());
-    }
-
-    ~NvmlManager()
-    {
-        NVML_CHECK(nvmlShutdown());
-    }
-};
+using tensorrt_llm::common::NvmlManager;
+using tensorrt_llm::common::NVMLWrapper;
 
 std::set<int> getLocalGroup(std::set<int> const& group)
 {
@@ -711,6 +701,7 @@ void AllreducePlugin::setGroupTopology() noexcept
     TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
 
     NvmlManager nvmlManager;
+    auto const& nvml = nvmlManager.sharedWrapper();
     std::unordered_set<int> visitedDevice;
     mIsP2PSupported = true;
     mIsNVLINKSupported = true;
@@ -738,26 +729,26 @@ void AllreducePlugin::setGroupTopology() noexcept
             }
 
             nvmlDevice_t firstDevice;
-            NVML_CHECK(nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
+            NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
 
             bool isNVLINK = false;
 
             for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
             {
                 nvmlPciInfo_t remotePciInfo;
-                if (nvmlDeviceGetNvLinkRemotePciInfo_v2(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
+                if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
                 {
                     continue;
                 }
 
                 nvmlDevice_t remoteDevice;
-                auto const result = nvmlDeviceGetHandleByPciBusId_v2(remotePciInfo.busId, &remoteDevice);
+                auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remotePciInfo.busId, &remoteDevice);
 
                 if (result == NVML_SUCCESS)
                 {
                     // Two GPUs are connected directly through nvlink
                     unsigned int remoteDeviceId;
-                    NVML_CHECK(nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
+                    NVML_CHECK(nvml->nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
 
                     if (remoteDeviceId == static_cast<unsigned int>(secondDeviceId))
                     {
@@ -770,12 +761,12 @@ void AllreducePlugin::setGroupTopology() noexcept
                     // now remotePciInfo represents the pci information of nvswitch,
                     // determine whether nvlink is supported by whether two GPUs are connected to the same nvswitch.
                     nvmlDevice_t secondDevice;
-                    NVML_CHECK(nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
+                    NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
 
                     for (unsigned int secondLink = 0; secondLink < NVML_NVLINK_MAX_LINKS; secondLink++)
                     {
                         nvmlPciInfo_t secondRemotePciInfo;
-                        if (nvmlDeviceGetNvLinkRemotePciInfo_v2(secondDevice, secondLink, &secondRemotePciInfo)
+                        if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(secondDevice, secondLink, &secondRemotePciInfo)
                             != NVML_SUCCESS)
                         {
                             continue;
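For readers new to this detection logic: for each NVLink of the first GPU, NVML reports the PCI identity of whatever sits at the remote end. If that endpoint resolves to a GPU handle, the link is a direct GPU-to-GPU connection; otherwise it is assumed to be an NVSwitch, and a second pass checks whether both GPUs reach a switch with the same PCI bus ID. A condensed sketch of the direct-link half through the new wrapper; areDirectlyLinked is a hypothetical name, not part of the patch:

#include "tensorrt_llm/common/nvmlWrapper.h"

// Illustrative helper, not part of the patch.
bool areDirectlyLinked(tensorrt_llm::common::NVMLWrapper const& nvml, int firstId, int secondId)
{
    nvmlDevice_t first;
    if (nvml.nvmlDeviceGetHandleByIndex(firstId, &first) != NVML_SUCCESS)
    {
        return false;
    }
    for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
    {
        nvmlPciInfo_t remotePci;
        if (nvml.nvmlDeviceGetNvLinkRemotePciInfo(first, link, &remotePci) != NVML_SUCCESS)
        {
            continue; // link not populated
        }
        nvmlDevice_t remote;
        if (nvml.nvmlDeviceGetHandleByPciBusId(remotePci.busId, &remote) != NVML_SUCCESS)
        {
            continue; // remote endpoint is not a GPU (e.g. an NVSwitch)
        }
        unsigned int remoteId;
        if (nvml.nvmlDeviceGetIndex(remote, &remoteId) == NVML_SUCCESS
            && remoteId == static_cast<unsigned int>(secondId))
        {
            return true; // direct NVLink between the two GPUs
        }
    }
    return false;
}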
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank); NvmlManager nvmlManager; + auto const& nvml = nvmlManager.sharedWrapper(); std::unordered_set visitedDevice; mIsP2PSupported = true; mIsNVLINKSupported = true; @@ -738,26 +729,26 @@ void AllreducePlugin::setGroupTopology() noexcept } nvmlDevice_t firstDevice; - NVML_CHECK(nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice)); + NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice)); bool isNVLINK = false; for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++) { nvmlPciInfo_t remotePciInfo; - if (nvmlDeviceGetNvLinkRemotePciInfo_v2(firstDevice, link, &remotePciInfo) != NVML_SUCCESS) + if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(firstDevice, link, &remotePciInfo) != NVML_SUCCESS) { continue; } nvmlDevice_t remoteDevice; - auto const result = nvmlDeviceGetHandleByPciBusId_v2(remotePciInfo.busId, &remoteDevice); + auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remotePciInfo.busId, &remoteDevice); if (result == NVML_SUCCESS) { // Two GPUs are connected directly through nvlink unsigned int remoteDeviceId; - NVML_CHECK(nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId)); + NVML_CHECK(nvml->nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId)); if (remoteDeviceId == static_cast(secondDeviceId)) { @@ -770,12 +761,12 @@ void AllreducePlugin::setGroupTopology() noexcept // now remotePciInfo represents the pci information of nvswitch, // determine whether nvlink is supported by whether two GPUs are connected to the same nvswitch. nvmlDevice_t secondDevice; - NVML_CHECK(nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice)); + NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice)); for (unsigned int secondLink = 0; secondLink < NVML_NVLINK_MAX_LINKS; secondLink++) { nvmlPciInfo_t secondRemotePciInfo; - if (nvmlDeviceGetNvLinkRemotePciInfo_v2(secondDevice, secondLink, &secondRemotePciInfo) + if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(secondDevice, secondLink, &secondRemotePciInfo) != NVML_SUCCESS) { continue; diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index c681e08bdf..ca81fbb0f6 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -81,7 +81,6 @@ set_property(TARGET runtime_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) add_cuda_architectures(runtime_src 89) target_include_directories(runtime_src PRIVATE ${MPI_C_INCLUDE_DIRS}) -target_link_libraries(runtime_src PUBLIC ${CUDA_NVML_LIB}) if(ENABLE_MULTI_DEVICE) target_link_libraries(runtime_src PUBLIC ${NCCL_LIB}) diff --git a/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu b/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu index 345930ab37..05b0071673 100644 --- a/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu +++ b/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu @@ -23,7 +23,7 @@ #include #endif #if ENABLE_MULTI_DEVICE -#include +#include "tensorrt_llm/common/nvmlWrapper.h" #endif #include @@ -46,7 +46,8 @@ nvmlReturn_t retval = cmd; \ if (retval != NVML_SUCCESS) \ { \ - printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(retval)); \ + printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, \ + tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(retval)); \ exit(EXIT_FAILURE); \ } \ } while (0) @@ -329,18 +330,41 @@ private: return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; } + auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance(); + tensorrt_llm::common::NvmlManager nvmlManager; + nvmlDevice_t nvml_device; - 
diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt
index 08c7baf9c6..367d3c5f86 100644
--- a/cpp/tensorrt_llm/thop/CMakeLists.txt
+++ b/cpp/tensorrt_llm/thop/CMakeLists.txt
@@ -134,8 +134,7 @@ endif()
 
 if(ENABLE_MULTI_DEVICE)
   target_include_directories(th_common PUBLIC ${MPI_C_INCLUDE_DIRS})
-  target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB}
-                                          CUDA::nvml)
+  target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB})
 endif()
 
 if(NOT WIN32)
diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp
index b5dc61d3a0..517d0eb2c3 100644
--- a/cpp/tensorrt_llm/thop/allreduceOp.cpp
+++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp
@@ -21,6 +21,7 @@
 #include "tensorrt_llm/common/dataType.h"
 #include "tensorrt_llm/common/mcastDevMemUtils.h"
 #include "tensorrt_llm/common/ncclUtils.h"
+#include "tensorrt_llm/common/nvmlWrapper.h"
 #include "tensorrt_llm/common/opUtils.h"
 #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
 #include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
@@ -85,19 +86,8 @@ struct overloaded : Ts...
 template <typename... Ts>
 overloaded(Ts...) -> overloaded<Ts...>;
 
-class NvmlManager
-{
-public:
-    NvmlManager()
-    {
-        NVML_CHECK_THROW(nvmlInit());
-    }
-
-    ~NvmlManager()
-    {
-        NVML_CHECK(nvmlShutdown());
-    }
-};
+using tensorrt_llm::common::NvmlManager;
+using tensorrt_llm::common::NVMLWrapper;
 
 std::set<int> getLocalGroup(std::set<int> const& group)
 {
@@ -965,7 +955,7 @@ private:
         MNNVLFabricInfo info;
 
 #if ENABLE_MULTI_DEVICE
-        // 1. Check CUDA driver version (needs >= 12.0.10)
+        // Check CUDA driver version (needs >= 12.1)
         int cudaDriverVersion = -1;
         TLLM_CUDA_CHECK(cudaDriverGetVersion(&cudaDriverVersion));
         if (cudaDriverVersion < 12010)
@@ -974,7 +964,7 @@ private:
             return info;
         }
 
-        // 2. Check multicast support
+        // Check multicast support
         CUdevice cuDevice;
         TLLM_CU_CHECK(cuDeviceGet(&cuDevice, deviceId));
         auto cudaDriver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
@@ -988,7 +978,7 @@ private:
             return info;
         }
 
-        // 3. Check fabric handle support
+        // Check fabric handle support
         int fabricHandleSupported = 0;
         TLLM_CU_CHECK(cudaDriver->cuDeviceGetAttribute(
             &fabricHandleSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cuDevice));
@@ -998,9 +988,10 @@ private:
             return info;
         }
 
-        // 4. Check NVML GPU Fabric Info using versioned API
+        // Check NVML GPU Fabric Info using versioned API (runtime dispatch)
+        auto nvml = NVMLWrapper::getInstance();
         nvmlDevice_t nvmlDevice;
-        nvmlReturn_t nvmlResult = nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
+        nvmlReturn_t nvmlResult = nvml->nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
         if (nvmlResult != NVML_SUCCESS)
         {
             TLLM_LOG_DEBUG("MNNVL check: Failed to get NVML device handle for device %d - error=%d", deviceId,
@@ -1008,24 +999,48 @@ private:
                 static_cast<int>(nvmlResult));
             return info;
         }
 
-        nvmlGpuFabricInfoV_t fabricInfoV;
-        std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
-        fabricInfoV.version = NVML_STRUCT_VERSION(GpuFabricInfo, 3);
-        nvmlResult = nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
+        nvmlGpuFabricState_t fabricState;
+        nvmlReturn_t fabricStatus;
+        unsigned char fabricClusterUuid[NVML_GPU_FABRIC_UUID_LEN];
+        unsigned int fabricCliqueId;
+        if (nvml->hasGpuFabricInfoV())
+        {
+            nvmlGpuFabricInfoV_t fabricInfoV;
+            std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
+            fabricInfoV.version = nvmlGpuFabricInfo_v2;
+            nvmlResult = nvml->nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
+            fabricState = fabricInfoV.state;
+            fabricStatus = fabricInfoV.status;
+            std::memcpy(fabricClusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+            fabricCliqueId = fabricInfoV.cliqueId;
+        }
+        else if (nvml->hasGpuFabricInfo())
+        {
+            nvmlGpuFabricInfo_t fabricInfoLegacy;
+            nvmlResult = nvml->nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfoLegacy);
+            fabricState = fabricInfoLegacy.state;
+            fabricStatus = fabricInfoLegacy.status;
+            std::memcpy(fabricClusterUuid, fabricInfoLegacy.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+            fabricCliqueId = fabricInfoLegacy.cliqueId;
+        }
+        else
+        {
+            TLLM_LOG_DEBUG("MNNVL check: Neither nvmlDeviceGetGpuFabricInfoV nor nvmlDeviceGetGpuFabricInfo available");
+            return info;
+        }
 
         if (nvmlResult != NVML_SUCCESS)
         {
             TLLM_LOG_DEBUG(
-                "MNNVL check: nvmlDeviceGetGpuFabricInfoV failed for device %d - error=%d (not supported or "
+                "MNNVL check: nvmlDeviceGetGpuFabricInfo failed for device %d - error=%d (not supported or "
                 "no fabric manager)",
                 deviceId, static_cast<int>(nvmlResult));
             return info;
         }
 
         // Check if fabric is fully initialized
-        if (fabricInfoV.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfoV.status != NVML_SUCCESS)
+        if (fabricState != NVML_GPU_FABRIC_STATE_COMPLETED || fabricStatus != NVML_SUCCESS)
         {
-            TLLM_LOG_DEBUG(
-                "MNNVL check: Fabric state not complete - state=%u status=%u", fabricInfoV.state, fabricInfoV.status);
+            TLLM_LOG_DEBUG("MNNVL check: Fabric state not complete - state=%u status=%u", fabricState, fabricStatus);
             return info;
         }
@@ -1034,7 +1049,7 @@ private:
         bool clusterUuidValid = false;
         for (int i = 0; i < NVML_GPU_FABRIC_UUID_LEN; ++i)
         {
-            if (fabricInfoV.clusterUuid[i] != 0)
+            if (fabricClusterUuid[i] != 0)
             {
                 clusterUuidValid = true;
                 break;
@@ -1047,7 +1062,7 @@ private:
             return info;
         }
 
-        // 5. Check NVLink links are active (similar to Python support_nvlink(True))
+        // Check NVLink links are active (similar to Python support_nvlink(True))
         unsigned int activeLinks = 0;
         unsigned int availableLinks = 0;
 
@@ -1055,12 +1070,12 @@ private:
         for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
         {
             unsigned int capP2p = 0;
             nvmlReturn_t capResult
-                = nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
+                = nvml->nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
             if (capResult == NVML_SUCCESS && capP2p)
             {
                 availableLinks++;
                 nvmlEnableState_t linkState;
-                if (nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
+                if (nvml->nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
                     && linkState == NVML_FEATURE_ENABLED)
                 {
                     activeLinks++;
@@ -1077,12 +1092,12 @@ private:
         }
 
         // Device supports MNNVL - copy fabric info
-        std::memcpy(info.clusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
-        info.cliqueId = fabricInfoV.cliqueId;
+        std::memcpy(info.clusterUuid, fabricClusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+        info.cliqueId = fabricCliqueId;
         info.isValid = true;
 
         TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (clusterUuid=%s, cliqueId=%u)", deviceId,
-            info.getClusterUuidString().c_str(), fabricInfoV.cliqueId);
+            info.getClusterUuidString().c_str(), fabricCliqueId);
 #endif
         return info;
     }
@@ -1104,6 +1119,7 @@ private:
         bool is_inter_node = (mGroup.size() != local_group.size());
 
         NvmlManager nvml_manager;
+        auto const& nvml = nvml_manager.sharedWrapper();
         mIsP2PSupported = true;
         mIsNVLINKSupported = true;
         mIsMNNVLSupported = false;
@@ -1134,26 +1150,27 @@ private:
                 }
 
                 nvmlDevice_t first_device;
-                NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
+                NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
 
                 bool is_NVLINK = false;
 
                 for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
                 {
                     nvmlPciInfo_t remote_pci_info;
-                    if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
+                    if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(first_device, link, &remote_pci_info)
+                        != NVML_SUCCESS)
                     {
                         continue;
                     }
 
                     nvmlDevice_t remote_device;
-                    auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
+                    auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remote_pci_info.busId, &remote_device);
 
                     if (result == NVML_SUCCESS)
                     {
                         // Two GPUs are connected directly through nvlink
                         unsigned int remote_device_id;
-                        NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
+                        NVML_CHECK_THROW(nvml->nvmlDeviceGetIndex(remote_device, &remote_device_id));
 
                         if (remote_device_id == static_cast<unsigned int>(second_device_id))
                         {
@@ -1167,12 +1184,12 @@ private:
                         // determine whether nvlink is supported by whether two GPUs are connected to the same
                         // nvswitch.
                         nvmlDevice_t second_device;
-                        NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
+                        NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
 
                         for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
                         {
                             nvmlPciInfo_t second_remote_pci_info;
-                            if (nvmlDeviceGetNvLinkRemotePciInfo_v2(
+                            if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(
                                     second_device, second_link, &second_remote_pci_info)
                                 != NVML_SUCCESS)
                             {
                                 continue;
diff --git a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
index a4a6e55e85..85d49433d8 100644
--- a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
+++ b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
@@ -18,6 +18,7 @@
 #include
 
 #include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/nvmlWrapper.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/tllmBuffers.h"
 #include "tensorrt_llm/runtime/virtualMemory.h"
@@ -57,9 +58,11 @@ protected:
         TLLM_CU_CHECK(cuDevicePrimaryCtxRetain(&ctx, dev));
         TLLM_CU_CHECK(cuCtxSetCurrent(ctx));
 
-        // Initialize NVML
-        nvmlReturn_t nvmlResult = nvmlInit();
-        TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", nvmlErrorString(nvmlResult));
+        // Initialize NVML via wrapper
+        mNvml = tensorrt_llm::common::NVMLWrapper::getInstance();
+        nvmlReturn_t nvmlResult = mNvml->nvmlInit();
+        TLLM_CHECK_WITH_INFO(
+            nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", mNvml->nvmlErrorString(nvmlResult));
 
         if (!memoryInfoAvailable())
         {
@@ -88,14 +91,16 @@ protected:
     static size_t getCurrentProcessMemoryInfo()
     {
+        auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
+
         // Get current process ID
         uint32_t currentPid = static_cast<uint32_t>(getpid());
 
         // Get device handle for GPU 0
         nvmlDevice_t device;
-        auto nvmlResult = nvmlDeviceGetHandleByIndex(0, &device);
+        auto nvmlResult = nvml->nvmlDeviceGetHandleByIndex(0, &device);
         TLLM_CHECK_WITH_INFO(
-            nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvmlErrorString(nvmlResult));
+            nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvml->nvmlErrorString(nvmlResult));
 
         // Get running processes
         unsigned int processCount = 1;
@@ -103,9 +108,9 @@ protected:
         nvmlResult = NVML_ERROR_INSUFFICIENT_SIZE;
         while (nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE)
         {
-            nvmlResult = nvmlDeviceGetComputeRunningProcesses_v3(device, &processCount, processes.data());
+            nvmlResult = nvml->nvmlDeviceGetComputeRunningProcesses(device, &processCount, processes.data());
             TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS || nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE,
-                "Failed to get process count: %s", nvmlErrorString(nvmlResult));
+                "Failed to get process count: %s", nvml->nvmlErrorString(nvmlResult));
             processes.resize(processCount);
         }
@@ -120,6 +125,8 @@ protected:
         return 0;
     }
+
+    std::shared_ptr<tensorrt_llm::common::NVMLWrapper> mNvml;
 };
 
 class VirtualMemoryTest : public VirtualMemoryTestBase
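The retry loop in getCurrentProcessMemoryInfo() follows NVML's standard buffer-sizing protocol: when the caller's buffer is too small, nvmlDeviceGetComputeRunningProcesses returns NVML_ERROR_INSUFFICIENT_SIZE and updates the count to the required size. A stand-alone sketch of the same protocol through the wrapper; listComputeProcesses is a hypothetical name, not part of the patch:

#include "tensorrt_llm/common/nvmlWrapper.h"

#include <vector>

// Illustrative helper, not part of the patch.
std::vector<nvmlProcessInfo_v2_t> listComputeProcesses(
    tensorrt_llm::common::NVMLWrapper const& nvml, nvmlDevice_t device)
{
    unsigned int count = 1;
    std::vector<nvmlProcessInfo_v2_t> processes(count);
    nvmlReturn_t result = NVML_ERROR_INSUFFICIENT_SIZE;
    while (result == NVML_ERROR_INSUFFICIENT_SIZE)
    {
        result = nvml.nvmlDeviceGetComputeRunningProcesses(device, &count, processes.data());
        processes.resize(count); // grow on retry, shrink to the real count on success
    }
    return result == NVML_SUCCESS ? processes : std::vector<nvmlProcessInfo_v2_t>{};
}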