Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[https://nvbugs/5355219][fix] Fix trtllm moe backend test config and Qwen3 MoE multi node (#7724)
Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
commit 7df515e335
parent aaa381d169
@@ -1892,8 +1892,8 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     ]

     fullSet += SBSASlurmTestConfigs.keySet()

-    multiNodesSBSAConfigs = (1..7).collectEntries { i ->
-        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, 7, 8, 2]]
+    multiNodesSBSAConfigs = (1..8).collectEntries { i ->
+        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, 8, 8, 2]]
     }
     fullSet += multiNodesSBSAConfigs.keySet()
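
The Jenkins change above widens the GB200 two-node post-merge pipeline from 7 to 8 parallel stages. Reading the per-stage list as [node pool, test list, shard index, total shards, GPUs, nodes] (an assumption from the surrounding config, not a documented schema), each stage runs one shard of the l0_gb200_multi_nodes list. A minimal Python sketch of round-robin sharding under that assumption, where shard_tests is illustrative and not a TensorRT-LLM helper:

def shard_tests(tests, shard_index, total_shards):
    # 1-based shard_index, matching the (1..8) range in the Groovy config.
    if not 1 <= shard_index <= total_shards:
        raise ValueError("shard_index must be in [1, total_shards]")
    return tests[shard_index - 1::total_shards]

tests = [f"case_{i}" for i in range(20)]
shards = [shard_tests(tests, i, 8) for i in range(1, 9)]
# Every case lands in exactly one of the 8 stages.
assert sorted(sum(shards, [])) == sorted(tests)

Going from 7 to 8 shards keeps per-stage wall time bounded as a new attention-DP test is added to the multi-node list further below.
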
@@ -5,6 +5,7 @@ import torch
 from torch import nn
 from transformers import Qwen3MoeConfig

+from tensorrt_llm._ipc_utils import can_access_peer
 from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
     BaseWeightMapper

@@ -187,6 +188,8 @@ class Qwen3MoEDecoderLayer(DecoderLayer):
             strategy=model_config.allreduce_strategy)
         self.next_layer_layernorm: RMSNorm = None

+        self.is_p2p_supported = can_access_peer(model_config.mapping)
+
         self.fusion_config = EagerFusionConfig()
         self.enable_fusion = os.environ.get(
             "TRTLLM_QWEN3_EAGER_FUSION_DISABLED", "0") == "0"
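
can_access_peer(model_config.mapping) comes from tensorrt_llm._ipc_utils (imported above); its implementation is not shown in this diff. As a rough sketch of the same idea, a pairwise peer-access check can be written against PyTorch's public CUDA API (the helper name below is illustrative):

import torch

def all_pairs_can_access_peer(device_ids):
    # True only if every ordered GPU pair supports CUDA peer-to-peer access.
    return all(
        torch.cuda.can_device_access_peer(a, b)
        for a in device_ids for b in device_ids if a != b
    )

if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    print(all_pairs_can_access_peer(list(range(torch.cuda.device_count()))))
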
@@ -242,11 +245,11 @@ class Qwen3MoEDecoderLayer(DecoderLayer):
                 hidden_states, residual)

         # Note: this fusion pattern is only supported for TRTLLM-nvfp4 backend now
-        do_finalize = not (hidden_states.shape[0]
-                           <= self.moe_allreduce.max_token
-                           and self.fusion_config.POST_MOE_FUSION
-                           and self.model_config.moe_backend == 'TRTLLM'
-                           and self.mlp.experts.has_nvfp4)
+        do_finalize = not (
+            hidden_states.shape[0] <= self.moe_allreduce.max_token
+            and self.fusion_config.POST_MOE_FUSION
+            and self.model_config.moe_backend == 'TRTLLM'
+            and self.mlp.experts.has_nvfp4 and self.is_p2p_supported)

         hidden_states = self.mlp(
             hidden_states,
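
The functional change is the extra self.is_p2p_supported conjunct: the fused finalize path goes through an IPC-based MoE allreduce, so it must not be taken when GPUs cannot access each other's memory directly. Restated as a standalone predicate for readability (illustrative only, not TensorRT-LLM code):

def can_defer_finalize(num_tokens, max_fused_tokens, post_moe_fusion,
                       moe_backend, has_nvfp4, p2p_supported):
    # All conditions must hold for the fused path; do_finalize is its negation.
    return (num_tokens <= max_fused_tokens
            and post_moe_fusion
            and moe_backend == 'TRTLLM'
            and has_nvfp4
            and p2p_supported)  # new gate: IPC fusion requires P2P access

# In the layer above: do_finalize = not can_defer_finalize(...)
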
@@ -2296,13 +2296,17 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
             (8, 1, 8, False, True, True, "TRTLLM", False),
+            (8, 1, 8, True, True, True, "TRTLLM", False),
             (8, 1, 8, False, True, True, "TRTLLM", True),
+            (8, 1, 8, True, True, True, "TRTLLM", True),
         ],
         ids=[
             "latency_moe_cutlass",
             "latency_moe_trtllm",
+            "latency_moe_trtllm_attention_dp",
             "latency_moe_trtllm_eagle3",
+            "latency_moe_trtllm_eagle3_attention_dp",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
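
The two added parametrizations enable attention DP for the TRTLLM MoE backend, with and without Eagle3. The ids entries become the bracketed test names referenced in the test lists below; a minimal self-contained example of that pytest mechanism:

import pytest

@pytest.mark.parametrize(
    "attention_dp,moe_backend",
    [(False, "TRTLLM"), (True, "TRTLLM")],
    ids=["latency_moe_trtllm", "latency_moe_trtllm_attention_dp"],
)
def test_demo(attention_dp, moe_backend):
    assert moe_backend == "TRTLLM"

# Collected as:
#   test_demo[latency_moe_trtllm]
#   test_demo[latency_moe_trtllm_attention_dp]
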
@@ -537,6 +537,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype

@@ -18,4 +18,5 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180)

@@ -269,6 +269,7 @@ examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SK
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384)
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3_attention_dp] SKIP (https://nvbugs/5437384)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444095)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)