diff --git a/cpp/tensorrt_llm/common/nvmlWrapper.cpp b/cpp/tensorrt_llm/common/nvmlWrapper.cpp
new file mode 100644
index 0000000000..0f647cbe55
--- /dev/null
+++ b/cpp/tensorrt_llm/common/nvmlWrapper.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <dlfcn.h>
+
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
+#include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/common/nvmlWrapper.h"
+
+#include <mutex>
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
+{
+
+std::shared_ptr<NVMLWrapper> NVMLWrapper::getInstance()
+{
+    static std::mutex mutex;
+    static std::weak_ptr<NVMLWrapper> instance;
+    std::shared_ptr<NVMLWrapper> result = instance.lock();
+    if (result)
+    {
+        return result;
+    }
+
+    std::lock_guard<std::mutex> const lock(mutex);
+    result = instance.lock();
+    if (!result)
+    {
+        result = std::shared_ptr<NVMLWrapper>(new NVMLWrapper());
+        instance = result;
+    }
+    return result;
+}
+
+NVMLWrapper::NVMLWrapper()
+    : mHandle(dlopen("libnvidia-ml.so.1", RTLD_LAZY))
+{
+    TLLM_CHECK_WITH_INFO(mHandle != nullptr, "NVML library (libnvidia-ml.so.1) could not be loaded.");
+
+    auto loadSym = [](void* handle, char const* name) -> void* { return dlsym(handle, name); };
+
+    auto loadRequired = [&](void* handle, char const* name) -> void*
+    {
+        void* sym = loadSym(handle, name);
+        TLLM_CHECK_WITH_INFO(sym != nullptr, "Required NVML symbol not found: %s", name);
+        return sym;
+    };
+
+    *reinterpret_cast<void**>(&_nvmlInit) = loadRequired(mHandle, "nvmlInit_v2");
+    *reinterpret_cast<void**>(&_nvmlShutdown) = loadRequired(mHandle, "nvmlShutdown");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetHandleByIndex) = loadRequired(mHandle, "nvmlDeviceGetHandleByIndex_v2");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetHandleByPciBusId)
+        = loadRequired(mHandle, "nvmlDeviceGetHandleByPciBusId_v2");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetIndex) = loadRequired(mHandle, "nvmlDeviceGetIndex");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkRemotePciInfo)
+        = loadRequired(mHandle, "nvmlDeviceGetNvLinkRemotePciInfo_v2");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkCapability) = loadRequired(mHandle, "nvmlDeviceGetNvLinkCapability");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkState) = loadRequired(mHandle, "nvmlDeviceGetNvLinkState");
+    *reinterpret_cast<void**>(&_nvmlErrorString) = loadRequired(mHandle, "nvmlErrorString");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetComputeRunningProcesses)
+        = loadRequired(mHandle, "nvmlDeviceGetComputeRunningProcesses_v3");
+
+    // Optional symbols - nullptr is OK (older drivers may not have these)
+    *reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfoV) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfoV");
+    *reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfo) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfo");
+
+    if (!_nvmlDeviceGetGpuFabricInfoV)
+    {
+        TLLM_LOG_INFO(
+            "NVML symbol nvmlDeviceGetGpuFabricInfoV not available (older driver). MNNVL fabric detection will use "
+            "legacy API or be disabled.");
+    }
+    if (!_nvmlDeviceGetGpuFabricInfo)
+    {
+        TLLM_LOG_INFO("NVML symbol nvmlDeviceGetGpuFabricInfo not available.");
+    }
+}
+
+NVMLWrapper::~NVMLWrapper()
+{
+    dlclose(mHandle);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlInit() const
+{
+    return (*_nvmlInit)();
+}
+
+nvmlReturn_t NVMLWrapper::nvmlShutdown() const
+{
+    return (*_nvmlShutdown)();
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const
+{
+    return (*_nvmlDeviceGetHandleByIndex)(index, device);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const
+{
+    return (*_nvmlDeviceGetHandleByPciBusId)(pciBusId, device);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const
+{
+    return (*_nvmlDeviceGetIndex)(device, index);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkRemotePciInfo(
+    nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const
+{
+    return (*_nvmlDeviceGetNvLinkRemotePciInfo)(device, link, pci);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkCapability(
+    nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const
+{
+    return (*_nvmlDeviceGetNvLinkCapability)(device, link, capability, capResult);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkState(
+    nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const
+{
+    return (*_nvmlDeviceGetNvLinkState)(device, link, isActive);
+}
+
+char const* NVMLWrapper::nvmlErrorString(nvmlReturn_t result) const
+{
+    return (*_nvmlErrorString)(result);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const
+{
+    if (!_nvmlDeviceGetGpuFabricInfoV)
+    {
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    }
+    return (*_nvmlDeviceGetGpuFabricInfoV)(device, gpuFabricInfo);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const
+{
+    if (!_nvmlDeviceGetGpuFabricInfo)
+    {
+        return NVML_ERROR_FUNCTION_NOT_FOUND;
+    }
+    return (*_nvmlDeviceGetGpuFabricInfo)(device, gpuFabricInfo);
+}
+
+nvmlReturn_t NVMLWrapper::nvmlDeviceGetComputeRunningProcesses(
+    nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const
+{
+    return (*_nvmlDeviceGetComputeRunningProcesses)(device, infoCount, infos);
+}
+
+bool NVMLWrapper::hasGpuFabricInfoV() const
+{
+    return _nvmlDeviceGetGpuFabricInfoV != nullptr;
+}
+
+bool NVMLWrapper::hasGpuFabricInfo() const
+{
+    return _nvmlDeviceGetGpuFabricInfo != nullptr;
+}
+
+} // namespace common
+
+TRTLLM_NAMESPACE_END
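Note on the pattern above: getInstance() mirrors the lazy-singleton idiom used by CUDADriverWrapper elsewhere in the codebase. The static weak_ptr lets the dlopen'd handle be released (the destructor dlclose's it) once the last shared_ptr owner goes away, and the second instance.lock() under the mutex prevents constructing the wrapper twice. A minimal, self-contained sketch of the idiom; the class name is illustrative, not part of the patch:

#include <memory>
#include <mutex>

class LazySingleton // hypothetical stand-in for NVMLWrapper
{
public:
    static std::shared_ptr<LazySingleton> getInstance()
    {
        static std::mutex mutex;
        static std::weak_ptr<LazySingleton> instance;
        // Fast path: reuse a live instance without taking the lock.
        if (auto existing = instance.lock())
        {
            return existing;
        }
        // Slow path: re-check under the lock, then construct once.
        std::lock_guard<std::mutex> const lock(mutex);
        auto result = instance.lock();
        if (!result)
        {
            result = std::shared_ptr<LazySingleton>(new LazySingleton());
            instance = result;
        }
        return result;
    }

private:
    LazySingleton() = default; // resource acquisition (e.g. dlopen) would go here
};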
diff --git a/cpp/tensorrt_llm/common/nvmlWrapper.h b/cpp/tensorrt_llm/common/nvmlWrapper.h
new file mode 100644
index 0000000000..edb76d9033
--- /dev/null
+++ b/cpp/tensorrt_llm/common/nvmlWrapper.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef NVML_WRAPPER_H
+#define NVML_WRAPPER_H
+
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
+
+#include <nvml.h>
+
+#include <memory>
+
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
+{
+
+class NVMLWrapper
+{
+public:
+    static std::shared_ptr<NVMLWrapper> getInstance();
+
+    ~NVMLWrapper();
+    NVMLWrapper(NVMLWrapper const&) = delete;
+    NVMLWrapper& operator=(NVMLWrapper const&) = delete;
+    NVMLWrapper(NVMLWrapper&&) = delete;
+    NVMLWrapper& operator=(NVMLWrapper&&) = delete;
+
+    // Required NVML functions
+    nvmlReturn_t nvmlInit() const;
+    nvmlReturn_t nvmlShutdown() const;
+    nvmlReturn_t nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const;
+    nvmlReturn_t nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const;
+    nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const;
+    nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const;
+    nvmlReturn_t nvmlDeviceGetNvLinkCapability(
+        nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const;
+    nvmlReturn_t nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const;
+    char const* nvmlErrorString(nvmlReturn_t result) const;
+    nvmlReturn_t nvmlDeviceGetComputeRunningProcesses(
+        nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const;
+
+    // Optional NVML functions (may be nullptr on older drivers)
+    nvmlReturn_t nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const;
+    nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const;
+
+    // Runtime availability checks
+    bool hasGpuFabricInfoV() const;
+    bool hasGpuFabricInfo() const;
+
+private:
+    void* mHandle;
+    NVMLWrapper();
+
+    // Required function pointers
+    nvmlReturn_t (*_nvmlInit)();
+    nvmlReturn_t (*_nvmlShutdown)();
+    nvmlReturn_t (*_nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t*);
+    nvmlReturn_t (*_nvmlDeviceGetHandleByPciBusId)(char const*, nvmlDevice_t*);
+    nvmlReturn_t (*_nvmlDeviceGetIndex)(nvmlDevice_t, unsigned int*);
+    nvmlReturn_t (*_nvmlDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t, unsigned int, nvmlPciInfo_t*);
+    nvmlReturn_t (*_nvmlDeviceGetNvLinkCapability)(nvmlDevice_t, unsigned int, nvmlNvLinkCapability_t, unsigned int*);
+    nvmlReturn_t (*_nvmlDeviceGetNvLinkState)(nvmlDevice_t, unsigned int, nvmlEnableState_t*);
+    char const* (*_nvmlErrorString)(nvmlReturn_t);
+    nvmlReturn_t (*_nvmlDeviceGetComputeRunningProcesses)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*);
+
+    // Optional function pointers (may be nullptr)
+    nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfoV)(nvmlDevice_t, nvmlGpuFabricInfoV_t*);
+    nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfo)(nvmlDevice_t, nvmlGpuFabricInfo_t*);
+};
+
+// RAII class that initializes NVML on construction and shuts down on destruction.
+// Replaces duplicated NvmlManager classes in allreduceOp.cpp and allreducePlugin.cpp.
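Taken together, the two new files give call sites a scoped initialization pattern: NvmlManager (defined just below in the same header) pins nvmlInit()/nvmlShutdown() to a C++ scope, and every NVML entry point is reached through the dlopen'd wrapper, so nothing has to link against libnvidia-ml at build time. A hypothetical usage sketch; the function name and device index are illustrative, not part of the patch:

#include "tensorrt_llm/common/nvmlWrapper.h"

#include <cstdio>

// Illustrative helper, not part of the patch.
bool deviceZeroIsVisible()
{
    // nvmlInit() runs here; nvmlShutdown() runs when 'manager' leaves scope.
    tensorrt_llm::common::NvmlManager manager;
    auto const& nvml = manager.sharedWrapper();

    nvmlDevice_t device;
    nvmlReturn_t const result = nvml->nvmlDeviceGetHandleByIndex(0, &device);
    if (result != NVML_SUCCESS)
    {
        // nvmlErrorString is also resolved through the dlopen'd library.
        std::printf("NVML error: %s\n", nvml->nvmlErrorString(result));
        return false;
    }
    return true;
}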
+class NvmlManager
+{
+public:
+    NvmlManager()
+        : mNvml(NVMLWrapper::getInstance())
+    {
+        auto result = mNvml->nvmlInit();
+        if (result != NVML_SUCCESS)
+        {
+            TLLM_THROW("Failed to initialize NVML: %s", mNvml->nvmlErrorString(result));
+        }
+    }
+
+    ~NvmlManager()
+    {
+        mNvml->nvmlShutdown();
+    }
+
+    NVMLWrapper const& wrapper() const
+    {
+        return *mNvml;
+    }
+
+    std::shared_ptr<NVMLWrapper> const& sharedWrapper() const
+    {
+        return mNvml;
+    }
+
+private:
+    std::shared_ptr<NVMLWrapper> mNvml;
+};
+
+} // namespace common
+
+TRTLLM_NAMESPACE_END
+
+#endif // NVML_WRAPPER_H
diff --git a/cpp/tensorrt_llm/common/opUtils.h b/cpp/tensorrt_llm/common/opUtils.h
index 3018a5da10..72e5a5ea3e 100644
--- a/cpp/tensorrt_llm/common/opUtils.h
+++ b/cpp/tensorrt_llm/common/opUtils.h
@@ -38,6 +38,8 @@
 #include
 #include
+
+#include "tensorrt_llm/common/nvmlWrapper.h"
 
 TRTLLM_NAMESPACE_BEGIN
 
 namespace common::op
@@ -319,7 +321,8 @@ TRTLLM_NAMESPACE_END
         nvmlReturn_t r = cmd;                                                                      \
         if (r != NVML_SUCCESS)                                                                     \
         {                                                                                          \
-            printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r));     \
+            printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__,                          \
+                tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r));             \
             exit(EXIT_FAILURE);                                                                    \
         }                                                                                          \
     } while (0)
@@ -330,6 +333,7 @@ TRTLLM_NAMESPACE_END
         nvmlReturn_t r = cmd;                                                                      \
         if (TLLM_UNLIKELY(r != NVML_SUCCESS))                                                      \
         {                                                                                          \
-            TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
+            TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__,                      \
+                tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r));             \
         }                                                                                          \
     } while (0)
diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
index 999c2b736c..d079a96f49 100755
--- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -53,7 +53,6 @@ target_link_libraries(
   ${TORCH_LIBRARIES}
   torch_python
   ${CUDA_DRV_LIB}
-  ${CUDA_NVML_LIB}
   th_common
   pg_utils)
 target_compile_definitions(
diff --git a/cpp/tensorrt_llm/plugins/CMakeLists.txt b/cpp/tensorrt_llm/plugins/CMakeLists.txt
index 3c25440366..8b89cccdc8 100755
--- a/cpp/tensorrt_llm/plugins/CMakeLists.txt
+++ b/cpp/tensorrt_llm/plugins/CMakeLists.txt
@@ -170,7 +170,6 @@ target_link_libraries(
   ${CUBLASLT_LIB}
   ${TRT_LIB}
   ${CUDA_DRV_LIB}
-  ${CUDA_NVML_LIB}
   ${CUDA_RT_LIB}
   ${CMAKE_DL_LIBS}
   ${SHARED_TARGET})
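The macro bodies above change only how the error string is obtained, so existing NVML_CHECK/NVML_CHECK_THROW call sites compile unchanged: the macros still accept any expression yielding an nvmlReturn_t, including calls through the wrapper. A hedged sketch of a call site, assuming NVML was already initialized (e.g. by an NvmlManager in an enclosing scope); the function name is hypothetical:

#include "tensorrt_llm/common/opUtils.h"

// Illustrative snippet, not part of the patch.
void checkedHandleLookup()
{
    auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
    nvmlDevice_t device;
    // On failure, NVML_CHECK prints file/line plus the wrapper-resolved error
    // string and exits; NVML_CHECK_THROW throws via TLLM_THROW instead.
    NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(0, &device));
}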
diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp
index 112364400d..24d9aff418 100644
--- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp
+++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp
@@ -19,6 +19,7 @@
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/customAllReduceUtils.h"
 #include "tensorrt_llm/common/dataType.h"
+#include "tensorrt_llm/common/nvmlWrapper.h"
 #include "tensorrt_llm/kernels/customAllReduceKernels.h"
 #include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
 #include "tensorrt_llm/runtime/utils/mpiUtils.h"
@@ -601,19 +602,8 @@ bool AllreducePlugin::isCustomAllReduceSupported(int ranks_per_node) const noexcept
         && (static_cast<size_t>(ranks_per_node) <= kernels::MAX_RANKS_PER_NODE) && (ranks_per_node > 0);
 }
 
-class NvmlManager
-{
-public:
-    NvmlManager()
-    {
-        NVML_CHECK(nvmlInit());
-    }
-
-    ~NvmlManager()
-    {
-        NVML_CHECK(nvmlShutdown());
-    }
-};
+using tensorrt_llm::common::NvmlManager;
+using tensorrt_llm::common::NVMLWrapper;
 
 std::set<int> getLocalGroup(std::set<int> const& group)
 {
@@ -711,6 +701,7 @@ void AllreducePlugin::setGroupTopology() noexcept
     TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
 
     NvmlManager nvmlManager;
+    auto const& nvml = nvmlManager.sharedWrapper();
     std::unordered_set<int> visitedDevice;
     mIsP2PSupported = true;
     mIsNVLINKSupported = true;
@@ -738,26 +729,26 @@ void AllreducePlugin::setGroupTopology() noexcept
             }
 
             nvmlDevice_t firstDevice;
-            NVML_CHECK(nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
+            NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
 
             bool isNVLINK = false;
 
             for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
             {
                 nvmlPciInfo_t remotePciInfo;
-                if (nvmlDeviceGetNvLinkRemotePciInfo_v2(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
+                if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
                 {
                     continue;
                 }
 
                 nvmlDevice_t remoteDevice;
-                auto const result = nvmlDeviceGetHandleByPciBusId_v2(remotePciInfo.busId, &remoteDevice);
+                auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remotePciInfo.busId, &remoteDevice);
 
                 if (result == NVML_SUCCESS)
                 {
                     // Two GPUs are connected directly through nvlink
                     unsigned int remoteDeviceId;
-                    NVML_CHECK(nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
+                    NVML_CHECK(nvml->nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
 
                     if (remoteDeviceId == static_cast<unsigned int>(secondDeviceId))
                     {
@@ -770,12 +761,12 @@ void AllreducePlugin::setGroupTopology() noexcept
                     // now remotePciInfo represents the pci information of nvswitch,
                     // determine whether nvlink is supported by whether two GPUs are connected to the same nvswitch.
                     nvmlDevice_t secondDevice;
-                    NVML_CHECK(nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
+                    NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
 
                     for (unsigned int secondLink = 0; secondLink < NVML_NVLINK_MAX_LINKS; secondLink++)
                     {
                         nvmlPciInfo_t secondRemotePciInfo;
-                        if (nvmlDeviceGetNvLinkRemotePciInfo_v2(secondDevice, secondLink, &secondRemotePciInfo)
+                        if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(secondDevice, secondLink, &secondRemotePciInfo)
                             != NVML_SUCCESS)
                         {
                             continue;
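For readers new to this detection logic: for each NVLink of the first GPU, NVML reports the PCI identity of whatever sits at the remote end. If that endpoint resolves to a GPU handle, the link is a direct GPU-to-GPU connection; otherwise it is assumed to be an NVSwitch, and a second pass checks whether both GPUs reach a switch with the same PCI bus ID. A condensed sketch of the direct-link half through the new wrapper; areDirectlyLinked is a hypothetical name, not part of the patch:

#include "tensorrt_llm/common/nvmlWrapper.h"

// Illustrative helper, not part of the patch.
bool areDirectlyLinked(tensorrt_llm::common::NVMLWrapper const& nvml, int firstId, int secondId)
{
    nvmlDevice_t first;
    if (nvml.nvmlDeviceGetHandleByIndex(firstId, &first) != NVML_SUCCESS)
    {
        return false;
    }
    for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
    {
        nvmlPciInfo_t remotePci;
        if (nvml.nvmlDeviceGetNvLinkRemotePciInfo(first, link, &remotePci) != NVML_SUCCESS)
        {
            continue; // link not populated
        }
        nvmlDevice_t remote;
        if (nvml.nvmlDeviceGetHandleByPciBusId(remotePci.busId, &remote) != NVML_SUCCESS)
        {
            continue; // remote endpoint is not a GPU (e.g. an NVSwitch)
        }
        unsigned int remoteId;
        if (nvml.nvmlDeviceGetIndex(remote, &remoteId) == NVML_SUCCESS
            && remoteId == static_cast<unsigned int>(secondId))
        {
            return true; // direct NVLink between the two GPUs
        }
    }
    return false;
}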
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank); NvmlManager nvmlManager; + auto const& nvml = nvmlManager.sharedWrapper(); std::unordered_set visitedDevice; mIsP2PSupported = true; mIsNVLINKSupported = true; @@ -738,26 +729,26 @@ void AllreducePlugin::setGroupTopology() noexcept } nvmlDevice_t firstDevice; - NVML_CHECK(nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice)); + NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice)); bool isNVLINK = false; for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++) { nvmlPciInfo_t remotePciInfo; - if (nvmlDeviceGetNvLinkRemotePciInfo_v2(firstDevice, link, &remotePciInfo) != NVML_SUCCESS) + if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(firstDevice, link, &remotePciInfo) != NVML_SUCCESS) { continue; } nvmlDevice_t remoteDevice; - auto const result = nvmlDeviceGetHandleByPciBusId_v2(remotePciInfo.busId, &remoteDevice); + auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remotePciInfo.busId, &remoteDevice); if (result == NVML_SUCCESS) { // Two GPUs are connected directly through nvlink unsigned int remoteDeviceId; - NVML_CHECK(nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId)); + NVML_CHECK(nvml->nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId)); if (remoteDeviceId == static_cast(secondDeviceId)) { @@ -770,12 +761,12 @@ void AllreducePlugin::setGroupTopology() noexcept // now remotePciInfo represents the pci information of nvswitch, // determine whether nvlink is supported by whether two GPUs are connected to the same nvswitch. nvmlDevice_t secondDevice; - NVML_CHECK(nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice)); + NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice)); for (unsigned int secondLink = 0; secondLink < NVML_NVLINK_MAX_LINKS; secondLink++) { nvmlPciInfo_t secondRemotePciInfo; - if (nvmlDeviceGetNvLinkRemotePciInfo_v2(secondDevice, secondLink, &secondRemotePciInfo) + if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(secondDevice, secondLink, &secondRemotePciInfo) != NVML_SUCCESS) { continue; diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index c681e08bdf..ca81fbb0f6 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -81,7 +81,6 @@ set_property(TARGET runtime_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) add_cuda_architectures(runtime_src 89) target_include_directories(runtime_src PRIVATE ${MPI_C_INCLUDE_DIRS}) -target_link_libraries(runtime_src PUBLIC ${CUDA_NVML_LIB}) if(ENABLE_MULTI_DEVICE) target_link_libraries(runtime_src PUBLIC ${NCCL_LIB}) diff --git a/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu b/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu index 345930ab37..05b0071673 100644 --- a/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu +++ b/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu @@ -23,7 +23,7 @@ #include #endif #if ENABLE_MULTI_DEVICE -#include +#include "tensorrt_llm/common/nvmlWrapper.h" #endif #include @@ -46,7 +46,8 @@ nvmlReturn_t retval = cmd; \ if (retval != NVML_SUCCESS) \ { \ - printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(retval)); \ + printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, \ + tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(retval)); \ exit(EXIT_FAILURE); \ } \ } while (0) @@ -329,18 +330,41 @@ private: return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; } + auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance(); + tensorrt_llm::common::NvmlManager nvmlManager; + nvmlDevice_t nvml_device; - 
diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt
index 08c7baf9c6..367d3c5f86 100644
--- a/cpp/tensorrt_llm/thop/CMakeLists.txt
+++ b/cpp/tensorrt_llm/thop/CMakeLists.txt
@@ -134,8 +134,7 @@ endif()
 
 if(ENABLE_MULTI_DEVICE)
   target_include_directories(th_common PUBLIC ${MPI_C_INCLUDE_DIRS})
-  target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB}
-                                          CUDA::nvml)
+  target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB})
 endif()
 
 if(NOT WIN32)
diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp
index b5dc61d3a0..517d0eb2c3 100644
--- a/cpp/tensorrt_llm/thop/allreduceOp.cpp
+++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp
@@ -21,6 +21,7 @@
 #include "tensorrt_llm/common/dataType.h"
 #include "tensorrt_llm/common/mcastDevMemUtils.h"
 #include "tensorrt_llm/common/ncclUtils.h"
+#include "tensorrt_llm/common/nvmlWrapper.h"
 #include "tensorrt_llm/common/opUtils.h"
 #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
 #include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
@@ -85,19 +86,8 @@ struct overloaded : Ts...
 template <typename... Ts>
 overloaded(Ts...) -> overloaded<Ts...>;
 
-class NvmlManager
-{
-public:
-    NvmlManager()
-    {
-        NVML_CHECK_THROW(nvmlInit());
-    }
-
-    ~NvmlManager()
-    {
-        NVML_CHECK(nvmlShutdown());
-    }
-};
+using tensorrt_llm::common::NvmlManager;
+using tensorrt_llm::common::NVMLWrapper;
 
 std::set<int> getLocalGroup(std::set<int> const& group)
 {
@@ -965,7 +955,7 @@ private:
         MNNVLFabricInfo info;
 
 #if ENABLE_MULTI_DEVICE
-        // 1. Check CUDA driver version (needs >= 12.0.10)
+        // Check CUDA driver version (needs >= 12.1)
         int cudaDriverVersion = -1;
         TLLM_CUDA_CHECK(cudaDriverGetVersion(&cudaDriverVersion));
         if (cudaDriverVersion < 12010)
@@ -974,7 +964,7 @@ private:
             return info;
         }
 
-        // 2. Check multicast support
+        // Check multicast support
         CUdevice cuDevice;
         TLLM_CU_CHECK(cuDeviceGet(&cuDevice, deviceId));
         auto cudaDriver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
@@ -988,7 +978,7 @@ private:
             return info;
         }
 
-        // 3. Check fabric handle support
+        // Check fabric handle support
         int fabricHandleSupported = 0;
         TLLM_CU_CHECK(cudaDriver->cuDeviceGetAttribute(
             &fabricHandleSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cuDevice));
@@ -998,9 +988,10 @@ private:
             return info;
         }
 
-        // 4. Check NVML GPU Fabric Info using versioned API
+        // Check NVML GPU Fabric Info using versioned API (runtime dispatch)
+        auto nvml = NVMLWrapper::getInstance();
         nvmlDevice_t nvmlDevice;
-        nvmlReturn_t nvmlResult = nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
+        nvmlReturn_t nvmlResult = nvml->nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
         if (nvmlResult != NVML_SUCCESS)
         {
             TLLM_LOG_DEBUG("MNNVL check: Failed to get NVML device handle for device %d - error=%d", deviceId,
@@ -1008,24 +999,48 @@ private:
                 static_cast<int>(nvmlResult));
             return info;
         }
 
-        nvmlGpuFabricInfoV_t fabricInfoV;
-        std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
-        fabricInfoV.version = NVML_STRUCT_VERSION(GpuFabricInfo, 3);
-        nvmlResult = nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
+        nvmlGpuFabricState_t fabricState;
+        nvmlReturn_t fabricStatus;
+        unsigned char fabricClusterUuid[NVML_GPU_FABRIC_UUID_LEN];
+        unsigned int fabricCliqueId;
+        if (nvml->hasGpuFabricInfoV())
+        {
+            nvmlGpuFabricInfoV_t fabricInfoV;
+            std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
+            fabricInfoV.version = nvmlGpuFabricInfo_v2;
+            nvmlResult = nvml->nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
+            fabricState = fabricInfoV.state;
+            fabricStatus = fabricInfoV.status;
+            std::memcpy(fabricClusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+            fabricCliqueId = fabricInfoV.cliqueId;
+        }
+        else if (nvml->hasGpuFabricInfo())
+        {
+            nvmlGpuFabricInfo_t fabricInfoLegacy;
+            nvmlResult = nvml->nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfoLegacy);
+            fabricState = fabricInfoLegacy.state;
+            fabricStatus = fabricInfoLegacy.status;
+            std::memcpy(fabricClusterUuid, fabricInfoLegacy.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+            fabricCliqueId = fabricInfoLegacy.cliqueId;
+        }
+        else
+        {
+            TLLM_LOG_DEBUG("MNNVL check: Neither nvmlDeviceGetGpuFabricInfoV nor nvmlDeviceGetGpuFabricInfo available");
+            return info;
+        }
 
         if (nvmlResult != NVML_SUCCESS)
         {
             TLLM_LOG_DEBUG(
-                "MNNVL check: nvmlDeviceGetGpuFabricInfoV failed for device %d - error=%d (not supported or "
+                "MNNVL check: nvmlDeviceGetGpuFabricInfo failed for device %d - error=%d (not supported or "
                 "no fabric manager)",
                 deviceId, static_cast<int>(nvmlResult));
             return info;
         }
 
         // Check if fabric is fully initialized
-        if (fabricInfoV.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfoV.status != NVML_SUCCESS)
+        if (fabricState != NVML_GPU_FABRIC_STATE_COMPLETED || fabricStatus != NVML_SUCCESS)
         {
-            TLLM_LOG_DEBUG(
-                "MNNVL check: Fabric state not complete - state=%u status=%u", fabricInfoV.state, fabricInfoV.status);
+            TLLM_LOG_DEBUG("MNNVL check: Fabric state not complete - state=%u status=%u", fabricState, fabricStatus);
             return info;
         }
@@ -1034,7 +1049,7 @@ private:
         bool clusterUuidValid = false;
         for (int i = 0; i < NVML_GPU_FABRIC_UUID_LEN; ++i)
         {
-            if (fabricInfoV.clusterUuid[i] != 0)
+            if (fabricClusterUuid[i] != 0)
             {
                 clusterUuidValid = true;
                 break;
@@ -1047,7 +1062,7 @@ private:
             return info;
         }
 
-        // 5. Check NVLink links are active (similar to Python support_nvlink(True))
+        // Check NVLink links are active (similar to Python support_nvlink(True))
         unsigned int activeLinks = 0;
         unsigned int availableLinks = 0;
 
@@ -1055,12 +1070,12 @@ private:
         for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
         {
             unsigned int capP2p = 0;
             nvmlReturn_t capResult
-                = nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
+                = nvml->nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
             if (capResult == NVML_SUCCESS && capP2p)
             {
                 availableLinks++;
                 nvmlEnableState_t linkState;
-                if (nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
+                if (nvml->nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
                     && linkState == NVML_FEATURE_ENABLED)
                 {
                     activeLinks++;
@@ -1077,12 +1092,12 @@ private:
         }
 
         // Device supports MNNVL - copy fabric info
-        std::memcpy(info.clusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
-        info.cliqueId = fabricInfoV.cliqueId;
+        std::memcpy(info.clusterUuid, fabricClusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+        info.cliqueId = fabricCliqueId;
         info.isValid = true;
 
         TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (clusterUuid=%s, cliqueId=%u)", deviceId,
-            info.getClusterUuidString().c_str(), fabricInfoV.cliqueId);
+            info.getClusterUuidString().c_str(), fabricCliqueId);
 #endif
         return info;
     }
@@ -1104,6 +1119,7 @@ private:
         bool is_inter_node = (mGroup.size() != local_group.size());
 
         NvmlManager nvml_manager;
+        auto const& nvml = nvml_manager.sharedWrapper();
         mIsP2PSupported = true;
         mIsNVLINKSupported = true;
         mIsMNNVLSupported = false;
@@ -1134,26 +1150,27 @@ private:
                 }
 
                 nvmlDevice_t first_device;
-                NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
+                NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
 
                 bool is_NVLINK = false;
 
                 for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
                 {
                     nvmlPciInfo_t remote_pci_info;
-                    if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
+                    if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(first_device, link, &remote_pci_info)
+                        != NVML_SUCCESS)
                     {
                         continue;
                     }
 
                     nvmlDevice_t remote_device;
-                    auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
+                    auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remote_pci_info.busId, &remote_device);
 
                     if (result == NVML_SUCCESS)
                     {
                         // Two GPUs are connected directly through nvlink
                         unsigned int remote_device_id;
-                        NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
+                        NVML_CHECK_THROW(nvml->nvmlDeviceGetIndex(remote_device, &remote_device_id));
 
                         if (remote_device_id == static_cast<unsigned int>(second_device_id))
                         {
@@ -1167,12 +1184,12 @@ private:
                         // determine whether nvlink is supported by whether two GPUs are connected to the same
                         // nvswitch.
                         nvmlDevice_t second_device;
-                        NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
+                        NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
 
                         for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
                         {
                             nvmlPciInfo_t second_remote_pci_info;
-                            if (nvmlDeviceGetNvLinkRemotePciInfo_v2(
+                            if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(
                                     second_device, second_link, &second_remote_pci_info)
                                 != NVML_SUCCESS)
                             {
                                 continue;
diff --git a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
index a4a6e55e85..85d49433d8 100644
--- a/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
+++ b/cpp/tests/unit_tests/runtime/virtualMemoryTest.cpp
@@ -18,6 +18,7 @@
 #include
 
 #include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/nvmlWrapper.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/tllmBuffers.h"
 #include "tensorrt_llm/runtime/virtualMemory.h"
@@ -57,9 +58,11 @@ protected:
         TLLM_CU_CHECK(cuDevicePrimaryCtxRetain(&ctx, dev));
         TLLM_CU_CHECK(cuCtxSetCurrent(ctx));
 
-        // Initialize NVML
-        nvmlReturn_t nvmlResult = nvmlInit();
-        TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", nvmlErrorString(nvmlResult));
+        // Initialize NVML via wrapper
+        mNvml = tensorrt_llm::common::NVMLWrapper::getInstance();
+        nvmlReturn_t nvmlResult = mNvml->nvmlInit();
+        TLLM_CHECK_WITH_INFO(
+            nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", mNvml->nvmlErrorString(nvmlResult));
 
         if (!memoryInfoAvailable())
         {
@@ -88,14 +91,16 @@ protected:
     static size_t getCurrentProcessMemoryInfo()
     {
+        auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
+
         // Get current process ID
         uint32_t currentPid = static_cast<uint32_t>(getpid());
 
         // Get device handle for GPU 0
         nvmlDevice_t device;
-        auto nvmlResult = nvmlDeviceGetHandleByIndex(0, &device);
+        auto nvmlResult = nvml->nvmlDeviceGetHandleByIndex(0, &device);
         TLLM_CHECK_WITH_INFO(
-            nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvmlErrorString(nvmlResult));
+            nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvml->nvmlErrorString(nvmlResult));
 
         // Get running processes
         unsigned int processCount = 1;
@@ -103,9 +108,9 @@ protected:
         nvmlResult = NVML_ERROR_INSUFFICIENT_SIZE;
         while (nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE)
         {
-            nvmlResult = nvmlDeviceGetComputeRunningProcesses_v3(device, &processCount, processes.data());
+            nvmlResult = nvml->nvmlDeviceGetComputeRunningProcesses(device, &processCount, processes.data());
             TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS || nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE,
-                "Failed to get process count: %s", nvmlErrorString(nvmlResult));
+                "Failed to get process count: %s", nvml->nvmlErrorString(nvmlResult));
             processes.resize(processCount);
         }
@@ -120,6 +125,8 @@ protected:
         return 0;
     }
+
+    std::shared_ptr<tensorrt_llm::common::NVMLWrapper> mNvml;
 };
 
 class VirtualMemoryTest : public VirtualMemoryTestBase
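The retry loop in getCurrentProcessMemoryInfo() follows NVML's standard buffer-sizing protocol: when the caller's buffer is too small, nvmlDeviceGetComputeRunningProcesses returns NVML_ERROR_INSUFFICIENT_SIZE and updates the count to the required size. A stand-alone sketch of the same protocol through the wrapper; listComputeProcesses is a hypothetical name, not part of the patch:

#include "tensorrt_llm/common/nvmlWrapper.h"

#include <vector>

// Illustrative helper, not part of the patch.
std::vector<nvmlProcessInfo_v2_t> listComputeProcesses(
    tensorrt_llm::common::NVMLWrapper const& nvml, nvmlDevice_t device)
{
    unsigned int count = 1;
    std::vector<nvmlProcessInfo_v2_t> processes(count);
    nvmlReturn_t result = NVML_ERROR_INSUFFICIENT_SIZE;
    while (result == NVML_ERROR_INSUFFICIENT_SIZE)
    {
        result = nvml.nvmlDeviceGetComputeRunningProcesses(device, &count, processes.data());
        processes.resize(count); // grow on retry, shrink to the real count on success
    }
    return result == NVML_SUCCESS ? processes : std::vector<nvmlProcessInfo_v2_t>{};
}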