diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp
index e9812b52c0..251ce440b5 100644
--- a/cpp/tensorrt_llm/thop/allreduceOp.cpp
+++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp
@@ -55,6 +55,7 @@
 #include
 #include
+#include <cstring>
 #include
 #include
@@ -914,91 +915,181 @@ private:
         cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported, mIsMNNVLSupported};
     }

-    bool checkMNNVLSupport(int device_id)
+    // Structure to hold MNNVL fabric info for comparison across ranks
+    struct MNNVLFabricInfo
+    {
+        char clusterUuid[NVML_GPU_FABRIC_UUID_LEN];
+        unsigned int cliqueId;
+        bool isValid;
+
+        MNNVLFabricInfo()
+            : cliqueId(0)
+            , isValid(false)
+        {
+            std::memset(clusterUuid, 0, NVML_GPU_FABRIC_UUID_LEN);
+        }
+
+        bool operator==(MNNVLFabricInfo const& other) const
+        {
+            if (!isValid || !other.isValid)
+            {
+                return false;
+            }
+            return std::memcmp(clusterUuid, other.clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0
+                && cliqueId == other.cliqueId;
+        }
+
+        bool operator!=(MNNVLFabricInfo const& other) const
+        {
+            return !(*this == other);
+        }
+
+        // Format cluster UUID as hex string for logging
+        std::string getClusterUuidString() const
+        {
+            std::string result;
+            result.reserve(NVML_GPU_FABRIC_UUID_LEN * 2);
+            for (int i = 0; i < NVML_GPU_FABRIC_UUID_LEN; ++i)
+            {
+                char buf[3];
+                std::snprintf(buf, sizeof(buf), "%02x", static_cast<unsigned char>(clusterUuid[i]));
+                result += buf;
+            }
+            return result;
+        }
+    };
+
+    // Get MNNVL fabric info from a device. Returns fabric info with isValid=true if the device supports MNNVL.
+    MNNVLFabricInfo getMNNVLFabricInfo(int deviceId)
+    {
+        MNNVLFabricInfo info;
+
 #if ENABLE_MULTI_DEVICE
         // 1. Check CUDA driver version (needs >= 12.0.10)
-        int cuda_driver_version = -1;
-        TLLM_CUDA_CHECK(cudaDriverGetVersion(&cuda_driver_version));
-        if (cuda_driver_version < 12010)
+        int cudaDriverVersion = -1;
+        TLLM_CUDA_CHECK(cudaDriverGetVersion(&cudaDriverVersion));
+        if (cudaDriverVersion < 12010)
         {
-            TLLM_LOG_DEBUG("MNNVL check: CUDA Driver version %d < 12010", cuda_driver_version);
-            return false;
+            TLLM_LOG_DEBUG("MNNVL check: CUDA Driver version %d < 12010", cudaDriverVersion);
+            return info;
         }

         // 2. Check multicast support
-        CUdevice cu_device;
-        TLLM_CU_CHECK(cuDeviceGet(&cu_device, device_id));
-        auto cuda_driver = tensorrt_llm::common::CUDADriverWrapper::getInstance();
+        CUdevice cuDevice;
+        TLLM_CU_CHECK(cuDeviceGet(&cuDevice, deviceId));
+        auto cudaDriver = tensorrt_llm::common::CUDADriverWrapper::getInstance();

-        int multicast_supported = 0;
-        TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute(
-            &multicast_supported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, cu_device));
-        if (!multicast_supported)
+        int multicastSupported = 0;
+        TLLM_CU_CHECK(
+            cudaDriver->cuDeviceGetAttribute(&multicastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, cuDevice));
+        if (!multicastSupported)
         {
-            TLLM_LOG_DEBUG("MNNVL check: Device %d does not support multicast", device_id);
-            return false;
+            TLLM_LOG_DEBUG("MNNVL check: Device %d does not support multicast", deviceId);
+            return info;
         }
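+
+        // Steps 2 and 3 are capability gates: without multicast and fabric-handle support (the latter presumably
+        // backing the CU_MEM_HANDLE_TYPE_FABRIC exports that MNNVL buffers rely on), there is no point querying
+        // the NVML fabric state below.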

         // 3. Check fabric handle support
-        int fabric_handle_supported = 0;
-        TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute(
-            &fabric_handle_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cu_device));
-        if (!fabric_handle_supported)
+        int fabricHandleSupported = 0;
+        TLLM_CU_CHECK(cudaDriver->cuDeviceGetAttribute(
+            &fabricHandleSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cuDevice));
+        if (!fabricHandleSupported)
         {
-            TLLM_LOG_DEBUG("MNNVL check: Device %d does not support fabric handles", device_id);
-            return false;
+            TLLM_LOG_DEBUG("MNNVL check: Device %d does not support fabric handles", deviceId);
+            return info;
         }

-        // 4. Check NVML GPU Fabric Info
-        nvmlDevice_t nvml_device;
-        NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(device_id, &nvml_device));
+        // 4. Check NVML GPU Fabric Info using versioned API
+        nvmlDevice_t nvmlDevice;
+        nvmlReturn_t nvmlResult = nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice);
+        if (nvmlResult != NVML_SUCCESS)
+        {
+            TLLM_LOG_DEBUG("MNNVL check: Failed to get NVML device handle for device %d - error=%d", deviceId,
+                static_cast<int>(nvmlResult));
+            return info;
+        }

-        nvmlGpuFabricInfo_t fabric_info;
-        NVML_CHECK_THROW(nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info));
-
-        // Check if fabric is fully initialized
-        if (fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_info.status != NVML_SUCCESS)
+        nvmlGpuFabricInfoV_t fabricInfoV;
+        std::memset(&fabricInfoV, 0, sizeof(fabricInfoV));
+        fabricInfoV.version = NVML_STRUCT_VERSION(GpuFabricInfo, 3);
+        nvmlResult = nvmlDeviceGetGpuFabricInfoV(nvmlDevice, &fabricInfoV);
+        if (nvmlResult != NVML_SUCCESS)
         {
             TLLM_LOG_DEBUG(
-                "MNNVL check: Fabric state not complete - state=%u status=%u", fabric_info.state, fabric_info.status);
-            return false;
+                "MNNVL check: nvmlDeviceGetGpuFabricInfoV failed for device %d - error=%d (not supported or "
+                "no fabric manager)",
+                deviceId, static_cast<int>(nvmlResult));
+            return info;
+        }
+
+        // Check if fabric is fully initialized
+        if (fabricInfoV.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabricInfoV.status != NVML_SUCCESS)
+        {
+            TLLM_LOG_DEBUG(
+                "MNNVL check: Fabric state not complete - state=%u status=%u", fabricInfoV.state, fabricInfoV.status);
+            return info;
+        }
+
+        // Check if clusterUuid is valid (not all zeros)
+        // If clusterUuid is all zeros, the GPU is not actually part of an NVLink fabric
+        bool clusterUuidValid = false;
+        for (int i = 0; i < NVML_GPU_FABRIC_UUID_LEN; ++i)
+        {
+            if (fabricInfoV.clusterUuid[i] != 0)
+            {
+                clusterUuidValid = true;
+                break;
+            }
+        }
+        if (!clusterUuidValid)
+        {
+            TLLM_LOG_DEBUG(
+                "MNNVL check: Device %d has invalid (all-zero) clusterUuid - not part of NVLink fabric", deviceId);
+            return info;
         }
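+
+        // Per the NVML docs, clusterUuid identifies the NVLink fabric cluster the GPU was registered to by the
+        // fabric manager, and cliqueId identifies a partition within that cluster; both are compared across
+        // ranks later to decide whether two GPUs are actually MNNVL-reachable.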

         // 5. Check NVLink links are active (similar to Python support_nvlink(True))
-        unsigned int active_links = 0;
-        unsigned int available_links = 0;
+        unsigned int activeLinks = 0;
+        unsigned int availableLinks = 0;

         for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++)
         {
-            unsigned int cap_p2p = 0;
-            nvmlReturn_t cap_result
-                = nvmlDeviceGetNvLinkCapability(nvml_device, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &cap_p2p);
-            if (cap_result == NVML_SUCCESS && cap_p2p)
+            unsigned int capP2p = 0;
+            nvmlReturn_t capResult
+                = nvmlDeviceGetNvLinkCapability(nvmlDevice, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &capP2p);
+            if (capResult == NVML_SUCCESS && capP2p)
             {
-                available_links++;
-                nvmlEnableState_t link_state;
-                if (nvmlDeviceGetNvLinkState(nvml_device, link, &link_state) == NVML_SUCCESS
-                    && link_state == NVML_FEATURE_ENABLED)
+                availableLinks++;
+                nvmlEnableState_t linkState;
+                if (nvmlDeviceGetNvLinkState(nvmlDevice, link, &linkState) == NVML_SUCCESS
+                    && linkState == NVML_FEATURE_ENABLED)
                 {
-                    active_links++;
+                    activeLinks++;
                 }
             }
         }

-        bool all_links_up = (active_links == available_links && available_links > 0);
-        if (!all_links_up)
+        bool allLinksUp = (activeLinks == availableLinks && availableLinks > 0);
+        if (!allLinksUp)
        {
             TLLM_LOG_DEBUG(
-                "MNNVL check: Not all NVLink links active - active=%u available=%u", active_links, available_links);
-            return false;
+                "MNNVL check: Not all NVLink links active - active=%u available=%u", activeLinks, availableLinks);
+            return info;
         }

-        TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (fabric_clique=%u)", device_id, fabric_info.cliqueId);
-        return true;
-#else
-        return false;
+        // Device supports MNNVL - copy fabric info
+        std::memcpy(info.clusterUuid, fabricInfoV.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+        info.cliqueId = fabricInfoV.cliqueId;
+        info.isValid = true;
+
+        TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (clusterUuid=%s, cliqueId=%u)", deviceId,
+            info.getClusterUuidString().c_str(), fabricInfoV.cliqueId);
 #endif
+        return info;
+    }
+
+    bool checkMNNVLSupport(int deviceId)
+    {
+        return getMNNVLFabricInfo(deviceId).isValid;
     }

     void setGroupTopology()
@@ -1111,84 +1202,131 @@ private:
             }
         }

-        // For inter-node groups, check MNNVL support
+        // For inter-node groups, check MNNVL support by comparing fabric info (cluster UUID and clique ID).
+        // Two GPUs are connected via NVLink in MNNVL if they share the same cluster UUID and clique ID.
+        // See: http://docs.nvidia.com/deploy/nvml-api/index.html#structnvmlGpuFabricInfo__v2__t
         if (is_inter_node)
         {
-            TLLM_LOG_INFO("Found inter-node TP group for rank %d, checking MNNVL support", rank);
+            TLLM_LOG_INFO("Found inter-node TP group for rank %d, checking MNNVL support via fabric info", rank);

-            // Check MNNVL support on local device(s)
-            bool local_mnnvl_supported = false;
+            // Get MNNVL fabric info on local device
+            MNNVLFabricInfo localFabricInfo;
             if (!local_group.empty())
             {
-                // Check MNNVL on first device in local group (all devices on same node should have same MNNVL status)
-                int check_device = *local_group.begin();
-                local_mnnvl_supported = checkMNNVLSupport(check_device);
+                // Get fabric info from first device in local group
+                int checkDevice = *local_group.begin();
+                localFabricInfo = getMNNVLFabricInfo(checkDevice);
             }

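+            // Packed record layout used for the allgather below, per rank (native endianness, so all ranks are
+            // assumed to run the same build on the same architecture):
+            //   byte 0      : isValid flag (0 or 1)
+            //   bytes 1-16  : clusterUuid (NVML_GPU_FABRIC_UUID_LEN bytes)
+            //   bytes 17-20 : cliqueId (unsigned int)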
-            // Gather MNNVL status from all ranks in the group
-            int local_mnnvl_status = local_mnnvl_supported ? 1 : 0;
-            std::vector<int> all_mnnvl_status(mGroup.size());
+            // Gather fabric info from all ranks in the group
+            // We need to share: isValid (1 byte), clusterUuid (16 bytes), cliqueId (4 bytes) = 21 bytes
+            // Pack into a structure for transmission
+            constexpr size_t kFabricInfoPackedSize = 1 + NVML_GPU_FABRIC_UUID_LEN + sizeof(unsigned int);
+            std::vector<char> localPackedInfo(kFabricInfoPackedSize);
+            localPackedInfo[0] = localFabricInfo.isValid ? 1 : 0;
+            std::memcpy(&localPackedInfo[1], localFabricInfo.clusterUuid, NVML_GPU_FABRIC_UUID_LEN);
+            std::memcpy(
+                &localPackedInfo[1 + NVML_GPU_FABRIC_UUID_LEN], &localFabricInfo.cliqueId, sizeof(unsigned int));

-            std::visit(overloaded{[&](std::shared_ptr<ncclComm_t>& comm_ptr)
+            std::vector<char> allPackedInfo(kFabricInfoPackedSize * mGroup.size());
+
+            std::visit(overloaded{[&](std::shared_ptr<ncclComm_t>& commPtr)
                        {
-                           // For NCCL comm, use MPI to gather status
-                           // Use MPI allgather to collect MNNVL status
-                           // Create a sub-communicator for the group
-                           std::vector<int> group_ranks(mGroup.begin(), mGroup.end());
-                           MPI_Group world_group, new_group;
-                           MPI_Comm group_comm;
-                           MPI_Comm_group(COMM_SESSION, &world_group);
-                           MPI_Group_incl(world_group, group_ranks.size(), group_ranks.data(), &new_group);
-                           MPI_Comm_create_group(COMM_SESSION, new_group, 0, &group_comm);
+                           // For NCCL comm, use MPI to gather fabric info
+                           std::vector<int> groupRanks(mGroup.begin(), mGroup.end());
+                           MPI_Group worldGroup, newGroup;
+                           MPI_Comm groupComm;
+                           MPI_Comm_group(COMM_SESSION, &worldGroup);
+                           MPI_Group_incl(worldGroup, groupRanks.size(), groupRanks.data(), &newGroup);
+                           MPI_Comm_create_group(COMM_SESSION, newGroup, 0, &groupComm);

-                           if (group_comm != MPI_COMM_NULL)
+                           if (groupComm != MPI_COMM_NULL)
                            {
-                               MPI_Allgather(&local_mnnvl_status, 1, MPI_INT, all_mnnvl_status.data(), 1, MPI_INT,
-                                   group_comm);
-                               MPI_Comm_free(&group_comm);
+                               MPI_Allgather(localPackedInfo.data(), kFabricInfoPackedSize, MPI_CHAR,
+                                   allPackedInfo.data(), kFabricInfoPackedSize, MPI_CHAR, groupComm);
+                               MPI_Comm_free(&groupComm);
                            }

-                           MPI_Group_free(&new_group);
-                           MPI_Group_free(&world_group);
+                           MPI_Group_free(&newGroup);
+                           MPI_Group_free(&worldGroup);
                        },
                 [&](c10::intrusive_ptr<c10d::ProcessGroup>& torchPg)
                 {
-                    // For ProcessGroup, use allgather directly
-                    // Note: This assumes the ProcessGroup is already set up for the correct group
-                    std::vector<torch::Tensor> input_tensors
-                        = {torch::tensor({local_mnnvl_status}, torch::kInt32)};
-                    std::vector<std::vector<torch::Tensor>> output_tensors(1);
-                    output_tensors[0].resize(mGroup.size());
-                    auto work = torchPg->allgather(output_tensors, input_tensors);
+                    // For ProcessGroup, use allgather with byte tensor
+                    auto inputTensor = torch::from_blob(
+                        localPackedInfo.data(), {static_cast<int64_t>(kFabricInfoPackedSize)}, torch::kUInt8)
+                                           .clone();
+                    std::vector<torch::Tensor> inputTensors = {inputTensor};
+                    std::vector<std::vector<torch::Tensor>> outputTensors(1);
+                    outputTensors[0].resize(mGroup.size());
+                    for (size_t i = 0; i < mGroup.size(); ++i)
+                    {
+                        outputTensors[0][i]
+                            = torch::empty({static_cast<int64_t>(kFabricInfoPackedSize)}, torch::kUInt8);
+                    }
+                    auto work = torchPg->allgather(outputTensors, inputTensors);
                     if (work)
                     {
                         work->wait();
                         for (size_t i = 0; i < mGroup.size(); ++i)
                         {
-                            all_mnnvl_status[i] = output_tensors[0][i].item<int>();
+                            std::memcpy(&allPackedInfo[i * kFabricInfoPackedSize],
+                                outputTensors[0][i].data_ptr(), kFabricInfoPackedSize);
                         }
                     }
                 }},
                 mNcclComm);
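+
+            // The loop below treats the first rank's record as the reference; a single invalid or mismatching
+            // record (different clusterUuid or cliqueId) disables MNNVL for the whole group.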

-            // Check if all ranks support MNNVL
-            bool all_ranks_support_mnnvl = true;
-            for (int status : all_mnnvl_status)
+            // Unpack and compare fabric info from all ranks
+            // All ranks must have valid fabric info AND share the same cluster UUID and clique ID
+            bool allRanksMnnvlConnected = true;
+            MNNVLFabricInfo referenceFabricInfo;
+            bool haveReference = false;
+
+            for (size_t i = 0; i < mGroup.size(); ++i)
             {
-                if (status == 0)
+                MNNVLFabricInfo rankFabricInfo;
+                rankFabricInfo.isValid = allPackedInfo[i * kFabricInfoPackedSize] != 0;
+                std::memcpy(rankFabricInfo.clusterUuid, &allPackedInfo[i * kFabricInfoPackedSize + 1],
+                    NVML_GPU_FABRIC_UUID_LEN);
+                std::memcpy(&rankFabricInfo.cliqueId,
+                    &allPackedInfo[i * kFabricInfoPackedSize + 1 + NVML_GPU_FABRIC_UUID_LEN], sizeof(unsigned int));
+
+                if (!rankFabricInfo.isValid)
                 {
-                    all_ranks_support_mnnvl = false;
+                    TLLM_LOG_DEBUG("MNNVL check: Rank %zu does not have valid fabric info", i);
+                    allRanksMnnvlConnected = false;
+                    break;
+                }
+
+                if (!haveReference)
+                {
+                    referenceFabricInfo = rankFabricInfo;
+                    haveReference = true;
+                }
+                else if (rankFabricInfo != referenceFabricInfo)
+                {
+                    // Fabric info mismatch - ranks are not in the same NVLink fabric
+                    TLLM_LOG_DEBUG("MNNVL check: Rank %zu has different fabric info (clique=%u vs reference clique=%u)",
+                        i, rankFabricInfo.cliqueId, referenceFabricInfo.cliqueId);
+                    allRanksMnnvlConnected = false;
                     break;
                 }
             }

-            // For inter-node: MNNVL support means all nodes have MNNVL
-            // Also need local NVLink for optimal performance
-            mIsMNNVLSupported = mIsNVLINKSupported && all_ranks_support_mnnvl;
+            // For inter-node: MNNVL support requires all ranks to be in the same fabric (same cluster UUID and
+            // clique ID). Also need local NVLink for optimal performance.
+            mIsMNNVLSupported = mIsNVLINKSupported && allRanksMnnvlConnected;
             mIsP2PSupported = false; // P2P doesn't work across nodes

-            TLLM_LOG_INFO("Inter-node topology: local_NVLink=%d, local_MNNVL=%d, all_ranks_MNNVL=%d, final_MNNVL=%d",
-                mIsNVLINKSupported ? 1 : 0, local_mnnvl_status, all_ranks_support_mnnvl ? 1 : 0,
+            TLLM_LOG_INFO(
+                "Inter-node topology: localNVLink=%d, localFabricValid=%d, allRanksSameFabric=%d, finalMNNVL=%d",
+                mIsNVLINKSupported ? 1 : 0, localFabricInfo.isValid ? 1 : 0, allRanksMnnvlConnected ? 1 : 0,
                 mIsMNNVLSupported ? 1 : 0);
+
+            if (mIsMNNVLSupported && haveReference)
+            {
+                TLLM_LOG_INFO("MNNVL enabled: All ranks share fabric (clusterUuid=%s, cliqueId=%u)",
+                    referenceFabricInfo.getClusterUuidString().c_str(), referenceFabricInfo.cliqueId);
+            }
         }
         else
         {
diff --git a/tensorrt_llm/_torch/compilation/multi_stream/auto_multi_stream.py b/tensorrt_llm/_torch/compilation/multi_stream/auto_multi_stream.py
index bc0cd984fc..cfc3ef93be 100644
--- a/tensorrt_llm/_torch/compilation/multi_stream/auto_multi_stream.py
+++ b/tensorrt_llm/_torch/compilation/multi_stream/auto_multi_stream.py
@@ -39,6 +39,7 @@ def estimate_time(node: Node) -> int:
     moe_ops = {
         torch.ops.trtllm.fp4_block_scale_moe_runner.default,
         torch.ops.trtllm.fused_moe.default,
+        torch.ops.trtllm.moe_custom_op.default,
     }

     gemm_ops = {