mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-16 07:53:55 +08:00
[https://nvbugs/5887893][fix] Make NVML work with older CUDA driver versions (#11465)
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
This commit is contained in:
parent
5130cbd73e
commit
dd74f90914
184
cpp/tensorrt_llm/common/nvmlWrapper.cpp
Normal file
184
cpp/tensorrt_llm/common/nvmlWrapper.cpp
Normal file
@ -0,0 +1,184 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
#include "tensorrt_llm/common/logger.h"
|
||||
#include "tensorrt_llm/common/nvmlWrapper.h"
|
||||
|
||||
#include <mutex>
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
std::shared_ptr<NVMLWrapper> NVMLWrapper::getInstance()
|
||||
{
|
||||
static std::mutex mutex;
|
||||
static std::weak_ptr<NVMLWrapper> instance;
|
||||
std::shared_ptr<NVMLWrapper> result = instance.lock();
|
||||
if (result)
|
||||
{
|
||||
return result;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> const lock(mutex);
|
||||
result = instance.lock();
|
||||
if (!result)
|
||||
{
|
||||
result = std::shared_ptr<NVMLWrapper>(new NVMLWrapper());
|
||||
instance = result;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Opens libnvidia-ml.so.1 with dlopen and resolves every NVML entry point used by this class.
//
// Required symbols must all be present; a missing one fails the TLLM_CHECK_WITH_INFO below.
// The two GPU fabric info symbols are optional and are left as nullptr when dlsym cannot find them.
NVMLWrapper::NVMLWrapper()
    : mHandle(dlopen("libnvidia-ml.so.1", RTLD_LAZY))
{
    TLLM_CHECK_WITH_INFO(mHandle != nullptr, "NVML library (libnvidia-ml.so.1) could not be loaded.");

    auto loadSym = [](void* handle, char const* name) -> void* { return dlsym(handle, name); };

    auto loadRequired = [&](void* handle, char const* name) -> void*
    {
        void* sym = loadSym(handle, name);
        TLLM_CHECK_WITH_INFO(sym != nullptr, "Required NVML symbol not found: %s", name);
        return sym;
    };

    try
    {
        *reinterpret_cast<void**>(&_nvmlInit) = loadRequired(mHandle, "nvmlInit_v2");
        *reinterpret_cast<void**>(&_nvmlShutdown) = loadRequired(mHandle, "nvmlShutdown");
        *reinterpret_cast<void**>(&_nvmlDeviceGetHandleByIndex) = loadRequired(mHandle, "nvmlDeviceGetHandleByIndex_v2");
        *reinterpret_cast<void**>(&_nvmlDeviceGetHandleByPciBusId)
            = loadRequired(mHandle, "nvmlDeviceGetHandleByPciBusId_v2");
        *reinterpret_cast<void**>(&_nvmlDeviceGetIndex) = loadRequired(mHandle, "nvmlDeviceGetIndex");
        *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkRemotePciInfo)
            = loadRequired(mHandle, "nvmlDeviceGetNvLinkRemotePciInfo_v2");
        *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkCapability)
            = loadRequired(mHandle, "nvmlDeviceGetNvLinkCapability");
        *reinterpret_cast<void**>(&_nvmlDeviceGetNvLinkState) = loadRequired(mHandle, "nvmlDeviceGetNvLinkState");
        *reinterpret_cast<void**>(&_nvmlErrorString) = loadRequired(mHandle, "nvmlErrorString");
        *reinterpret_cast<void**>(&_nvmlDeviceGetComputeRunningProcesses)
            = loadRequired(mHandle, "nvmlDeviceGetComputeRunningProcesses_v3");

        // Optional symbols - nullptr is OK (older drivers may not have these)
        *reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfoV) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfoV");
        *reinterpret_cast<void**>(&_nvmlDeviceGetGpuFabricInfo) = loadSym(mHandle, "nvmlDeviceGetGpuFabricInfo");

        if (!_nvmlDeviceGetGpuFabricInfoV)
        {
            TLLM_LOG_INFO(
                "NVML symbol nvmlDeviceGetGpuFabricInfoV not available (older driver). MNNVL fabric detection will use "
                "legacy API or be disabled.");
        }
        if (!_nvmlDeviceGetGpuFabricInfo)
        {
            TLLM_LOG_INFO("NVML symbol nvmlDeviceGetGpuFabricInfo not available.");
        }
    }
    catch (...)
    {
        // A destructor is not invoked for an object whose constructor exits with an exception,
        // so the library handle has to be released here or it would leak.
        dlclose(mHandle);
        mHandle = nullptr;
        throw;
    }
}
|
||||
|
||||
// Releases the library handle that the constructor obtained from dlopen.
// The return value of dlclose is intentionally not inspected.
NVMLWrapper::~NVMLWrapper()
{
    static_cast<void>(dlclose(mHandle));
}
|
||||
|
||||
// Forwards to the "nvmlInit_v2" entry point resolved by the constructor.
nvmlReturn_t NVMLWrapper::nvmlInit() const
{
    auto const initFn = _nvmlInit;
    return initFn();
}
|
||||
|
||||
// Forwards to the "nvmlShutdown" entry point resolved by the constructor.
nvmlReturn_t NVMLWrapper::nvmlShutdown() const
{
    auto const shutdownFn = _nvmlShutdown;
    return shutdownFn();
}
|
||||
|
||||
// Forwards to the "nvmlDeviceGetHandleByIndex_v2" entry point resolved by the constructor.
// The arguments are passed through unchanged and the NVML status code is returned as-is.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const
{
    nvmlReturn_t const status = _nvmlDeviceGetHandleByIndex(index, device);
    return status;
}
|
||||
|
||||
// Forwards to the "nvmlDeviceGetHandleByPciBusId_v2" entry point resolved by the constructor.
// The arguments are passed through unchanged and the NVML status code is returned as-is.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const
{
    nvmlReturn_t const status = _nvmlDeviceGetHandleByPciBusId(pciBusId, device);
    return status;
}
|
||||
|
||||
// Forwards to the "nvmlDeviceGetIndex" entry point resolved by the constructor.
// The arguments are passed through unchanged and the NVML status code is returned as-is.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const
{
    nvmlReturn_t const status = _nvmlDeviceGetIndex(device, index);
    return status;
}
|
||||
|
||||
// Forwards to the "nvmlDeviceGetNvLinkRemotePciInfo_v2" entry point resolved by the constructor.
// The arguments are passed through unchanged and the NVML status code is returned as-is.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkRemotePciInfo(
    nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const
{
    nvmlReturn_t const status = _nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci);
    return status;
}
|
||||
|
||||
// Forwards to the "nvmlDeviceGetNvLinkCapability" entry point resolved by the constructor.
// The arguments are passed through unchanged and the NVML status code is returned as-is.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkCapability(
    nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const
{
    nvmlReturn_t const status = _nvmlDeviceGetNvLinkCapability(device, link, capability, capResult);
    return status;
}
|
||||
|
||||
// Forwards to the "nvmlDeviceGetNvLinkState" entry point resolved by the constructor.
// The arguments are passed through unchanged and the NVML status code is returned as-is.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetNvLinkState(
    nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const
{
    nvmlReturn_t const status = _nvmlDeviceGetNvLinkState(device, link, isActive);
    return status;
}
|
||||
|
||||
// Forwards to the "nvmlErrorString" entry point resolved by the constructor and returns
// whatever string pointer that function yields for the given status code.
char const* NVMLWrapper::nvmlErrorString(nvmlReturn_t result) const
{
    char const* const message = _nvmlErrorString(result);
    return message;
}
|
||||
|
||||
// Calls the optional "nvmlDeviceGetGpuFabricInfoV" entry point when the constructor managed to
// resolve it; otherwise reports NVML_ERROR_FUNCTION_NOT_FOUND without touching gpuFabricInfo.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const
{
    return _nvmlDeviceGetGpuFabricInfoV ? _nvmlDeviceGetGpuFabricInfoV(device, gpuFabricInfo)
                                        : NVML_ERROR_FUNCTION_NOT_FOUND;
}
|
||||
|
||||
// Calls the optional "nvmlDeviceGetGpuFabricInfo" entry point when the constructor managed to
// resolve it; otherwise reports NVML_ERROR_FUNCTION_NOT_FOUND without touching gpuFabricInfo.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const
{
    return _nvmlDeviceGetGpuFabricInfo ? _nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo)
                                       : NVML_ERROR_FUNCTION_NOT_FOUND;
}
|
||||
|
||||
// Forwards to the "nvmlDeviceGetComputeRunningProcesses_v3" entry point resolved by the constructor.
// The arguments are passed through unchanged and the NVML status code is returned as-is.
nvmlReturn_t NVMLWrapper::nvmlDeviceGetComputeRunningProcesses(
    nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const
{
    nvmlReturn_t const status = _nvmlDeviceGetComputeRunningProcesses(device, infoCount, infos);
    return status;
}
|
||||
|
||||
bool NVMLWrapper::hasGpuFabricInfoV() const
|
||||
{
|
||||
return _nvmlDeviceGetGpuFabricInfoV != nullptr;
|
||||
}
|
||||
|
||||
bool NVMLWrapper::hasGpuFabricInfo() const
|
||||
{
|
||||
return _nvmlDeviceGetGpuFabricInfo != nullptr;
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
123
cpp/tensorrt_llm/common/nvmlWrapper.h
Normal file
123
cpp/tensorrt_llm/common/nvmlWrapper.h
Normal file
@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef NVML_WRAPPER_H
|
||||
#define NVML_WRAPPER_H
|
||||
|
||||
#include "tensorrt_llm/common/config.h"
|
||||
|
||||
#include <nvml.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
class NVMLWrapper
|
||||
{
|
||||
public:
|
||||
static std::shared_ptr<NVMLWrapper> getInstance();
|
||||
|
||||
~NVMLWrapper();
|
||||
NVMLWrapper(NVMLWrapper const&) = delete;
|
||||
NVMLWrapper& operator=(NVMLWrapper const&) = delete;
|
||||
NVMLWrapper(NVMLWrapper&&) = delete;
|
||||
NVMLWrapper& operator=(NVMLWrapper&&) = delete;
|
||||
|
||||
// Required NVML functions
|
||||
nvmlReturn_t nvmlInit() const;
|
||||
nvmlReturn_t nvmlShutdown() const;
|
||||
nvmlReturn_t nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) const;
|
||||
nvmlReturn_t nvmlDeviceGetHandleByPciBusId(char const* pciBusId, nvmlDevice_t* device) const;
|
||||
nvmlReturn_t nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index) const;
|
||||
nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) const;
|
||||
nvmlReturn_t nvmlDeviceGetNvLinkCapability(
|
||||
nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) const;
|
||||
nvmlReturn_t nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) const;
|
||||
char const* nvmlErrorString(nvmlReturn_t result) const;
|
||||
nvmlReturn_t nvmlDeviceGetComputeRunningProcesses(
|
||||
nvmlDevice_t device, unsigned int* infoCount, nvmlProcessInfo_v2_t* infos) const;
|
||||
|
||||
// Optional NVML functions (may be nullptr on older drivers)
|
||||
nvmlReturn_t nvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t* gpuFabricInfo) const;
|
||||
nvmlReturn_t nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t* gpuFabricInfo) const;
|
||||
|
||||
// Runtime availability checks
|
||||
bool hasGpuFabricInfoV() const;
|
||||
bool hasGpuFabricInfo() const;
|
||||
|
||||
private:
|
||||
void* mHandle;
|
||||
NVMLWrapper();
|
||||
|
||||
// Required function pointers
|
||||
nvmlReturn_t (*_nvmlInit)();
|
||||
nvmlReturn_t (*_nvmlShutdown)();
|
||||
nvmlReturn_t (*_nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t*);
|
||||
nvmlReturn_t (*_nvmlDeviceGetHandleByPciBusId)(char const*, nvmlDevice_t*);
|
||||
nvmlReturn_t (*_nvmlDeviceGetIndex)(nvmlDevice_t, unsigned int*);
|
||||
nvmlReturn_t (*_nvmlDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t, unsigned int, nvmlPciInfo_t*);
|
||||
nvmlReturn_t (*_nvmlDeviceGetNvLinkCapability)(nvmlDevice_t, unsigned int, nvmlNvLinkCapability_t, unsigned int*);
|
||||
nvmlReturn_t (*_nvmlDeviceGetNvLinkState)(nvmlDevice_t, unsigned int, nvmlEnableState_t*);
|
||||
char const* (*_nvmlErrorString)(nvmlReturn_t);
|
||||
nvmlReturn_t (*_nvmlDeviceGetComputeRunningProcesses)(nvmlDevice_t, unsigned int*, nvmlProcessInfo_v2_t*);
|
||||
|
||||
// Optional function pointers (may be nullptr)
|
||||
nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfoV)(nvmlDevice_t, nvmlGpuFabricInfoV_t*);
|
||||
nvmlReturn_t (*_nvmlDeviceGetGpuFabricInfo)(nvmlDevice_t, nvmlGpuFabricInfo_t*);
|
||||
};
|
||||
|
||||
// RAII class that initializes NVML on construction and shuts down on destruction.
|
||||
// Replaces duplicated NvmlManager classes in allreduceOp.cpp and allreducePlugin.cpp.
|
||||
class NvmlManager
|
||||
{
|
||||
public:
|
||||
NvmlManager()
|
||||
: mNvml(NVMLWrapper::getInstance())
|
||||
{
|
||||
auto result = mNvml->nvmlInit();
|
||||
if (result != NVML_SUCCESS)
|
||||
{
|
||||
TLLM_THROW("Failed to initialize NVML: %s", mNvml->nvmlErrorString(result));
|
||||
}
|
||||
}
|
||||
|
||||
~NvmlManager()
|
||||
{
|
||||
mNvml->nvmlShutdown();
|
||||
}
|
||||
|
||||
NVMLWrapper const& wrapper() const
|
||||
{
|
||||
return *mNvml;
|
||||
}
|
||||
|
||||
std::shared_ptr<NVMLWrapper> const& sharedWrapper() const
|
||||
{
|
||||
return mNvml;
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<NVMLWrapper> mNvml;
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
|
||||
TRTLLM_NAMESPACE_END
|
||||
|
||||
#endif // NVML_WRAPPER_H
|
||||
@ -38,6 +38,8 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "tensorrt_llm/common/nvmlWrapper.h"
|
||||
|
||||
TRTLLM_NAMESPACE_BEGIN
|
||||
|
||||
namespace common::op
|
||||
@ -319,7 +321,8 @@ TRTLLM_NAMESPACE_END
|
||||
nvmlReturn_t r = cmd; \
|
||||
if (r != NVML_SUCCESS) \
|
||||
{ \
|
||||
printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
|
||||
printf("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
|
||||
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (0)
|
||||
@ -330,6 +333,7 @@ TRTLLM_NAMESPACE_END
|
||||
nvmlReturn_t r = cmd; \
|
||||
if (TLLM_UNLIKELY(r != NVML_SUCCESS)) \
|
||||
{ \
|
||||
TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(r)); \
|
||||
TLLM_THROW("Failed, NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
|
||||
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(r)); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@ -53,7 +53,6 @@ target_link_libraries(
|
||||
${TORCH_LIBRARIES}
|
||||
torch_python
|
||||
${CUDA_DRV_LIB}
|
||||
${CUDA_NVML_LIB}
|
||||
th_common
|
||||
pg_utils)
|
||||
target_compile_definitions(
|
||||
|
||||
@ -170,7 +170,6 @@ target_link_libraries(
|
||||
${CUBLASLT_LIB}
|
||||
${TRT_LIB}
|
||||
${CUDA_DRV_LIB}
|
||||
${CUDA_NVML_LIB}
|
||||
${CUDA_RT_LIB}
|
||||
${CMAKE_DL_LIBS}
|
||||
${SHARED_TARGET})
|
||||
|
||||
@ -19,6 +19,7 @@
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/customAllReduceUtils.h"
|
||||
#include "tensorrt_llm/common/dataType.h"
|
||||
#include "tensorrt_llm/common/nvmlWrapper.h"
|
||||
#include "tensorrt_llm/kernels/customAllReduceKernels.h"
|
||||
#include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
|
||||
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
|
||||
@ -601,19 +602,8 @@ bool AllreducePlugin::isCustomAllReduceSupported(int ranks_per_node) const noexc
|
||||
&& (static_cast<size_t>(ranks_per_node) <= kernels::MAX_RANKS_PER_NODE) && (ranks_per_node > 0);
|
||||
}
|
||||
|
||||
class NvmlManager
|
||||
{
|
||||
public:
|
||||
NvmlManager()
|
||||
{
|
||||
NVML_CHECK(nvmlInit());
|
||||
}
|
||||
|
||||
~NvmlManager()
|
||||
{
|
||||
NVML_CHECK(nvmlShutdown());
|
||||
}
|
||||
};
|
||||
using tensorrt_llm::common::NvmlManager;
|
||||
using tensorrt_llm::common::NVMLWrapper;
|
||||
|
||||
std::set<int> getLocalGroup(std::set<int> const& group)
|
||||
{
|
||||
@ -711,6 +701,7 @@ void AllreducePlugin::setGroupTopology() noexcept
|
||||
TLLM_LOG_INFO("TP group is intra-node for rank %d", rank);
|
||||
|
||||
NvmlManager nvmlManager;
|
||||
auto const& nvml = nvmlManager.sharedWrapper();
|
||||
std::unordered_set<int> visitedDevice;
|
||||
mIsP2PSupported = true;
|
||||
mIsNVLINKSupported = true;
|
||||
@ -738,26 +729,26 @@ void AllreducePlugin::setGroupTopology() noexcept
|
||||
}
|
||||
|
||||
nvmlDevice_t firstDevice;
|
||||
NVML_CHECK(nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
|
||||
NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(firstDeviceId, &firstDevice));
|
||||
|
||||
bool isNVLINK = false;
|
||||
|
||||
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
|
||||
{
|
||||
nvmlPciInfo_t remotePciInfo;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
|
||||
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(firstDevice, link, &remotePciInfo) != NVML_SUCCESS)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
nvmlDevice_t remoteDevice;
|
||||
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remotePciInfo.busId, &remoteDevice);
|
||||
auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remotePciInfo.busId, &remoteDevice);
|
||||
|
||||
if (result == NVML_SUCCESS)
|
||||
{
|
||||
// Two GPUs are connected directly through nvlink
|
||||
unsigned int remoteDeviceId;
|
||||
NVML_CHECK(nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
|
||||
NVML_CHECK(nvml->nvmlDeviceGetIndex(remoteDevice, &remoteDeviceId));
|
||||
|
||||
if (remoteDeviceId == static_cast<unsigned int>(secondDeviceId))
|
||||
{
|
||||
@ -770,12 +761,12 @@ void AllreducePlugin::setGroupTopology() noexcept
|
||||
// now remotePciInfo represents the pci information of nvswitch,
|
||||
// determine whether nvlink is supported by whether two GPUs are connected to the same nvswitch.
|
||||
nvmlDevice_t secondDevice;
|
||||
NVML_CHECK(nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
|
||||
NVML_CHECK(nvml->nvmlDeviceGetHandleByIndex(secondDeviceId, &secondDevice));
|
||||
|
||||
for (unsigned int secondLink = 0; secondLink < NVML_NVLINK_MAX_LINKS; secondLink++)
|
||||
{
|
||||
nvmlPciInfo_t secondRemotePciInfo;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(secondDevice, secondLink, &secondRemotePciInfo)
|
||||
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(secondDevice, secondLink, &secondRemotePciInfo)
|
||||
!= NVML_SUCCESS)
|
||||
{
|
||||
continue;
|
||||
|
||||
@ -81,7 +81,6 @@ set_property(TARGET runtime_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
|
||||
add_cuda_architectures(runtime_src 89)
|
||||
|
||||
target_include_directories(runtime_src PRIVATE ${MPI_C_INCLUDE_DIRS})
|
||||
target_link_libraries(runtime_src PUBLIC ${CUDA_NVML_LIB})
|
||||
|
||||
if(ENABLE_MULTI_DEVICE)
|
||||
target_link_libraries(runtime_src PUBLIC ${NCCL_LIB})
|
||||
|
||||
@ -23,7 +23,7 @@
|
||||
#include <nvshmem/nvshmemx.h>
|
||||
#endif
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
#include <nvml.h>
|
||||
#include "tensorrt_llm/common/nvmlWrapper.h"
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
|
||||
@ -46,7 +46,8 @@
|
||||
nvmlReturn_t retval = cmd; \
|
||||
if (retval != NVML_SUCCESS) \
|
||||
{ \
|
||||
printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, nvmlErrorString(retval)); \
|
||||
printf("Failed: NVML error %s:%d '%s'\n", __FILE__, __LINE__, \
|
||||
tensorrt_llm::common::NVMLWrapper::getInstance()->nvmlErrorString(retval)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (0)
|
||||
@ -329,18 +330,41 @@ private:
|
||||
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
}
|
||||
|
||||
auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
|
||||
tensorrt_llm::common::NvmlManager nvmlManager;
|
||||
|
||||
nvmlDevice_t nvml_device;
|
||||
nvmlGpuFabricInfo_t fabric_info;
|
||||
NVMLCHECK(nvmlInit_v2());
|
||||
NVMLCHECK(nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
|
||||
NVMLCHECK(nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
|
||||
NVMLCHECK(nvmlShutdown());
|
||||
NVMLCHECK(nvml->nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
|
||||
|
||||
nvmlGpuFabricState_t fabric_state;
|
||||
nvmlReturn_t fabric_status;
|
||||
if (nvml->hasGpuFabricInfoV())
|
||||
{
|
||||
nvmlGpuFabricInfoV_t fabric_info_v;
|
||||
memset(&fabric_info_v, 0, sizeof(fabric_info_v));
|
||||
fabric_info_v.version = nvmlGpuFabricInfo_v2;
|
||||
NVMLCHECK(nvml->nvmlDeviceGetGpuFabricInfoV(nvml_device, &fabric_info_v));
|
||||
fabric_state = fabric_info_v.state;
|
||||
fabric_status = fabric_info_v.status;
|
||||
}
|
||||
else if (nvml->hasGpuFabricInfo())
|
||||
{
|
||||
nvmlGpuFabricInfo_t fabric_info;
|
||||
NVMLCHECK(nvml->nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
|
||||
fabric_state = fabric_info.state;
|
||||
fabric_status = fabric_info.status;
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_LOG_TRACE("checking fabric support... NVML fabric info APIs not available.");
|
||||
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
}
|
||||
|
||||
// Check if the fabric is fully initialized.
|
||||
if (fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_info.status != NVML_SUCCESS)
|
||||
if (fabric_state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_status != NVML_SUCCESS)
|
||||
{
|
||||
TLLM_LOG_TRACE("checking fabric support... fabric state is NOT COMPLETE: state=%u status=%u.",
|
||||
fabric_info.state, fabric_info.status);
|
||||
TLLM_LOG_TRACE("checking fabric support... fabric state is NOT COMPLETE: state=%u status=%u.", fabric_state,
|
||||
fabric_status);
|
||||
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
}
|
||||
|
||||
@ -381,8 +405,7 @@ private:
|
||||
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE("fabric status: state=%u status=%u clique=%u", device_id, fabric_info.state, fabric_info.status,
|
||||
fabric_info.cliqueId);
|
||||
TLLM_LOG_TRACE("fabric status: state=%u status=%u", device_id, fabric_state, fabric_status);
|
||||
|
||||
CUCHECK(cuMemRelease(handle));
|
||||
// If we get here, fabric handles are supported.
|
||||
|
||||
@ -134,8 +134,7 @@ endif()
|
||||
|
||||
if(ENABLE_MULTI_DEVICE)
|
||||
target_include_directories(th_common PUBLIC ${MPI_C_INCLUDE_DIRS})
|
||||
target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB}
|
||||
CUDA::nvml)
|
||||
target_link_libraries(th_common PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIB})
|
||||
endif()
|
||||
|
||||
if(NOT WIN32)
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include "tensorrt_llm/common/dataType.h"
|
||||
#include "tensorrt_llm/common/mcastDevMemUtils.h"
|
||||
#include "tensorrt_llm/common/ncclUtils.h"
|
||||
#include "tensorrt_llm/common/nvmlWrapper.h"
|
||||
#include "tensorrt_llm/common/opUtils.h"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h"
|
||||
#include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
|
||||
@ -85,19 +86,8 @@ struct overloaded : Ts...
|
||||
template <class... Ts>
|
||||
overloaded(Ts...) -> overloaded<Ts...>;
|
||||
|
||||
class NvmlManager
|
||||
{
|
||||
public:
|
||||
NvmlManager()
|
||||
{
|
||||
NVML_CHECK_THROW(nvmlInit());
|
||||
}
|
||||
|
||||
~NvmlManager()
|
||||
{
|
||||
NVML_CHECK(nvmlShutdown());
|
||||
}
|
||||
};
|
||||
using tensorrt_llm::common::NvmlManager;
|
||||
using tensorrt_llm::common::NVMLWrapper;
|
||||
|
||||
std::set<int> getLocalGroup(std::set<int> const& group)
|
||||
{
|
||||
@ -965,7 +955,7 @@ private:
|
||||
MNNVLFabricInfo info;
|
||||
|
||||
#if ENABLE_MULTI_DEVICE
|
||||
// 1. Check CUDA driver version (needs >= 12.0.10)
|
||||
// Check CUDA driver version (needs >= 12.0.10)
|
||||
int cudaDriverVersion = -1;
|
||||
TLLM_CUDA_CHECK(cudaDriverGetVersion(&cudaDriverVersion));
|
||||
if (cudaDriverVersion < 12010)
|
||||
@ -974,7 +964,7 @@ private:
|
||||
return info;
|
||||
}
|
||||
|
||||
// 2. Check multicast support
|
||||
// Check multicast support
|
||||
CUdevice cuDevice;
|
||||
TLLM_CU_CHECK(cuDeviceGet(&cuDevice, deviceId));
|
||||
auto cudaDriver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
|
||||
@ -988,7 +978,7 @@ private:
|
||||
return info;
|
||||
}
|
||||
|
||||
// 3. Check fabric handle support
|
||||
// Check fabric handle support
|
||||
int fabricHandleSupported = 0;
|
||||
TLLM_CU_CHECK(cudaDriver->cuDeviceGetAttribute(
|
||||
&fabricHandleSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cuDevice));
|
||||
@ -998,9 +988,10 @@ private:
|
||||
return info;
|
||||
}
|
||||
|
||||
// 4. Check NVML GPU Fabric Info using versioned API
|
||||
// Check NVML GPU Fabric Info using versioned API (runtime dispatch)
|
||||
auto nvml = NVMLWrapper::getInstance();
|
||||
nvmlDevice_t nvmlDevice;
|
||||
nvmlReturn_t nvmlResult = nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
|
||||
nvmlReturn_t nvmlResult = nvml->nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
|
||||
if (nvmlResult != NVML_SUCCESS)
|
||||
{
|
||||
TLLM_LOG_DEBUG("MNNVL check: Failed to get NVML device handle for device %d - error=%d", deviceId,
|
||||
@ -1008,24 +999,48 @@ private:
|
||||
return info;
|
||||
}
|
||||
|
||||
nvmlGpuFabricInfoV_t fabricInfoV;
|
||||
std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
|
||||
fabricInfoV.version = NVML_STRUCT_VERSION(GpuFabricInfo, 3);
|
||||
nvmlResult = nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
|
||||
nvmlGpuFabricState_t fabricState;
|
||||
nvmlReturn_t fabricStatus;
|
||||
unsigned char fabricClusterUuid[NVML_GPU_FABRIC_UUID_LEN];
|
||||
unsigned int fabricCliqueId;
|
||||
if (nvml->hasGpuFabricInfoV())
|
||||
{
|
||||
nvmlGpuFabricInfoV_t fabricInfoV;
|
||||
std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
|
||||
fabricInfoV.version = nvmlGpuFabricInfo_v2;
|
||||
nvmlResult = nvml->nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
|
||||
fabricState = fabricInfoV.state;
|
||||
fabricStatus = fabricInfoV.status;
|
||||
std::memcpy(fabricClusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
|
||||
fabricCliqueId = fabricInfoV.cliqueId;
|
||||
}
|
||||
else if (nvml->hasGpuFabricInfo())
|
||||
{
|
||||
nvmlGpuFabricInfo_t fabricInfoLegacy;
|
||||
nvmlResult = nvml->nvmlDeviceGetGpuFabricInfo(nvmlDevice, &fabricInfoLegacy);
|
||||
fabricState = fabricInfoLegacy.state;
|
||||
fabricStatus = fabricInfoLegacy.status;
|
||||
std::memcpy(fabricClusterUuid, fabricInfoLegacy.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
|
||||
fabricCliqueId = fabricInfoLegacy.cliqueId;
|
||||
}
|
||||
else
|
||||
{
|
||||
TLLM_LOG_DEBUG("MNNVL check: Neither nvmlDeviceGetGpuFabricInfoV nor nvmlDeviceGetGpuFabricInfo available");
|
||||
return info;
|
||||
}
|
||||
if (nvmlResult != NVML_SUCCESS)
|
||||
{
|
||||
TLLM_LOG_DEBUG(
|
||||
"MNNVL check: nvmlDeviceGetGpuFabricInfoV failed for device %d - error=%d (not supported or "
|
||||
"MNNVL check: nvmlDeviceGetGpuFabricInfo failed for device %d - error=%d (not supported or "
|
||||
"no fabric manager)",
|
||||
deviceId, static_cast<int>(nvmlResult));
|
||||
return info;
|
||||
}
|
||||
|
||||
// Check if fabric is fully initialized
|
||||
if (fabricInfoV.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfoV.status != NVML_SUCCESS)
|
||||
if (fabricState != NVML_GPU_FABRIC_STATE_COMPLETED || fabricStatus != NVML_SUCCESS)
|
||||
{
|
||||
TLLM_LOG_DEBUG(
|
||||
"MNNVL check: Fabric state not complete - state=%u status=%u", fabricInfoV.state, fabricInfoV.status);
|
||||
TLLM_LOG_DEBUG("MNNVL check: Fabric state not complete - state=%u status=%u", fabricState, fabricStatus);
|
||||
return info;
|
||||
}
|
||||
|
||||
@ -1034,7 +1049,7 @@ private:
|
||||
bool clusterUuidValid = false;
|
||||
for (int i = 0; i < NVML_GPU_FABRIC_UUID_LEN; ++i)
|
||||
{
|
||||
if (fabricInfoV.clusterUuid[i] != 0)
|
||||
if (fabricClusterUuid[i] != 0)
|
||||
{
|
||||
clusterUuidValid = true;
|
||||
break;
|
||||
@ -1047,7 +1062,7 @@ private:
|
||||
return info;
|
||||
}
|
||||
|
||||
// 5. Check NVLink links are active (similar to Python support_nvlink(True))
|
||||
// Check NVLink links are active (similar to Python support_nvlink(True))
|
||||
unsigned int activeLinks = 0;
|
||||
unsigned int availableLinks = 0;
|
||||
|
||||
@ -1055,12 +1070,12 @@ private:
|
||||
{
|
||||
unsigned int capP2p = 0;
|
||||
nvmlReturn_t capResult
|
||||
= nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
|
||||
= nvml->nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
|
||||
if (capResult == NVML_SUCCESS && capP2p)
|
||||
{
|
||||
availableLinks++;
|
||||
nvmlEnableState_t linkState;
|
||||
if (nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
|
||||
if (nvml->nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
|
||||
&& linkState == NVML_FEATURE_ENABLED)
|
||||
{
|
||||
activeLinks++;
|
||||
@ -1077,12 +1092,12 @@ private:
|
||||
}
|
||||
|
||||
// Device supports MNNVL - copy fabric info
|
||||
std::memcpy(info.clusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
|
||||
info.cliqueId = fabricInfoV.cliqueId;
|
||||
std::memcpy(info.clusterUuid, fabricClusterUuid, NVML_GPU_FABRIC_UUID_LEN);
|
||||
info.cliqueId = fabricCliqueId;
|
||||
info.isValid = true;
|
||||
|
||||
TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (clusterUuid=%s, cliqueId=%u)", deviceId,
|
||||
info.getClusterUuidString().c_str(), fabricInfoV.cliqueId);
|
||||
info.getClusterUuidString().c_str(), fabricCliqueId);
|
||||
#endif
|
||||
return info;
|
||||
}
|
||||
@ -1104,6 +1119,7 @@ private:
|
||||
bool is_inter_node = (mGroup.size() != local_group.size());
|
||||
|
||||
NvmlManager nvml_manager;
|
||||
auto const& nvml = nvml_manager.sharedWrapper();
|
||||
mIsP2PSupported = true;
|
||||
mIsNVLINKSupported = true;
|
||||
mIsMNNVLSupported = false;
|
||||
@ -1134,26 +1150,27 @@ private:
|
||||
}
|
||||
|
||||
nvmlDevice_t first_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
|
||||
NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(first_device_id, &first_device));
|
||||
|
||||
bool is_NVLINK = false;
|
||||
|
||||
for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
|
||||
{
|
||||
nvmlPciInfo_t remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS)
|
||||
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(first_device, link, &remote_pci_info)
|
||||
!= NVML_SUCCESS)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
nvmlDevice_t remote_device;
|
||||
auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device);
|
||||
auto const result = nvml->nvmlDeviceGetHandleByPciBusId(remote_pci_info.busId, &remote_device);
|
||||
|
||||
if (result == NVML_SUCCESS)
|
||||
{
|
||||
// Two GPUs are connected directly through nvlink
|
||||
unsigned int remote_device_id;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id));
|
||||
NVML_CHECK_THROW(nvml->nvmlDeviceGetIndex(remote_device, &remote_device_id));
|
||||
|
||||
if (remote_device_id == static_cast<unsigned int>(second_device_id))
|
||||
{
|
||||
@ -1167,12 +1184,12 @@ private:
|
||||
// determine whether nvlink is supported by whether two GPUs are connected to the same
|
||||
// nvswitch.
|
||||
nvmlDevice_t second_device;
|
||||
NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
|
||||
NVML_CHECK_THROW(nvml->nvmlDeviceGetHandleByIndex(second_device_id, &second_device));
|
||||
|
||||
for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++)
|
||||
{
|
||||
nvmlPciInfo_t second_remote_pci_info;
|
||||
if (nvmlDeviceGetNvLinkRemotePciInfo_v2(
|
||||
if (nvml->nvmlDeviceGetNvLinkRemotePciInfo(
|
||||
second_device, second_link, &second_remote_pci_info)
|
||||
!= NVML_SUCCESS)
|
||||
{
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
#include <nvml.h>
|
||||
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/common/nvmlWrapper.h"
|
||||
#include "tensorrt_llm/runtime/bufferManager.h"
|
||||
#include "tensorrt_llm/runtime/tllmBuffers.h"
|
||||
#include "tensorrt_llm/runtime/virtualMemory.h"
|
||||
@ -57,9 +58,11 @@ protected:
|
||||
TLLM_CU_CHECK(cuDevicePrimaryCtxRetain(&ctx, dev));
|
||||
TLLM_CU_CHECK(cuCtxSetCurrent(ctx));
|
||||
|
||||
// Initialize NVML
|
||||
nvmlReturn_t nvmlResult = nvmlInit();
|
||||
TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", nvmlErrorString(nvmlResult));
|
||||
// Initialize NVML via wrapper
|
||||
mNvml = tensorrt_llm::common::NVMLWrapper::getInstance();
|
||||
nvmlReturn_t nvmlResult = mNvml->nvmlInit();
|
||||
TLLM_CHECK_WITH_INFO(
|
||||
nvmlResult == NVML_SUCCESS, "Failed to initialize NVML: %s", mNvml->nvmlErrorString(nvmlResult));
|
||||
|
||||
if (!memoryInfoAvailable())
|
||||
{
|
||||
@ -88,14 +91,16 @@ protected:
|
||||
|
||||
static size_t getCurrentProcessMemoryInfo()
|
||||
{
|
||||
auto nvml = tensorrt_llm::common::NVMLWrapper::getInstance();
|
||||
|
||||
// Get current process ID
|
||||
uint32_t currentPid = static_cast<uint32_t>(getpid());
|
||||
|
||||
// Get device handle for GPU 0
|
||||
nvmlDevice_t device;
|
||||
auto nvmlResult = nvmlDeviceGetHandleByIndex(0, &device);
|
||||
auto nvmlResult = nvml->nvmlDeviceGetHandleByIndex(0, &device);
|
||||
TLLM_CHECK_WITH_INFO(
|
||||
nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvmlErrorString(nvmlResult));
|
||||
nvmlResult == NVML_SUCCESS, "Failed to get device handle: %s", nvml->nvmlErrorString(nvmlResult));
|
||||
|
||||
// Get running processes
|
||||
unsigned int processCount = 1;
|
||||
@ -103,9 +108,9 @@ protected:
|
||||
nvmlResult = NVML_ERROR_INSUFFICIENT_SIZE;
|
||||
while (nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE)
|
||||
{
|
||||
nvmlResult = nvmlDeviceGetComputeRunningProcesses_v3(device, &processCount, processes.data());
|
||||
nvmlResult = nvml->nvmlDeviceGetComputeRunningProcesses(device, &processCount, processes.data());
|
||||
TLLM_CHECK_WITH_INFO(nvmlResult == NVML_SUCCESS || nvmlResult == NVML_ERROR_INSUFFICIENT_SIZE,
|
||||
"Failed to get process count: %s", nvmlErrorString(nvmlResult));
|
||||
"Failed to get process count: %s", nvml->nvmlErrorString(nvmlResult));
|
||||
processes.resize(processCount);
|
||||
}
|
||||
|
||||
@ -120,6 +125,8 @@ protected:
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::shared_ptr<tensorrt_llm::common::NVMLWrapper> mNvml;
|
||||
};
|
||||
|
||||
class VirtualMemoryTest : public VirtualMemoryTestBase
|
||||
|
||||
Loading…
Reference in New Issue
Block a user