mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[https://nvbugs/5437405][fix] qwen3 235b eagle3 ci (#7000)
Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
This commit is contained in:
parent
647a52698a
commit
ba0a86e0bb
@ -2446,11 +2446,12 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
|
||||
[
|
||||
(8, 1, 8, True, True, True, "CUTLASS", False),
|
||||
(8, 1, 8, True, True, True, "TRTLLM", False),
|
||||
(8, 1, 8, False, False, False, "TRTLLM", True),
|
||||
(8, 1, 8, True, True, True, "TRTLLM", True),
|
||||
],
|
||||
ids=[
|
||||
"latency_moe_cutlass", "latency_moe_trtllm",
|
||||
"latency_moe_trtllm_eagle3"
|
||||
"latency_moe_cutlass",
|
||||
"latency_moe_trtllm",
|
||||
"latency_moe_trtllm_eagle3",
|
||||
],
|
||||
)
|
||||
def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
|
||||
@ -2485,6 +2486,50 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
|
||||
task = GSM8K(self.MODEL_NAME)
|
||||
task.evaluate(llm)
|
||||
|
||||
@skip_pre_blackwell
@pytest.mark.skip_less_mpi_world_size(4)
@pytest.mark.parametrize(
    "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
    [
        # This case runs with TP4/EP4: per the original author's note, TP8
        # has a bug when the TRTLLM MoE backend is used together with eagle3.
        (4, 1, 4, False, False, False, "TRTLLM", True),
    ],
    ids=["latency_moe_trtllm_eagle3"],
)
def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
                     cuda_graph, overlap_scheduler, moe_backend, eagle3):
    """Evaluate the NVFP4 Qwen3-235B-A22B checkpoint on MMLU and GSM8K.

    The parametrized arguments are forwarded to ``LLM`` as follows:

    * ``tp_size`` / ``pp_size`` / ``ep_size`` -> tensor, pipeline and MoE
      expert parallel sizes.
    * ``attention_dp`` -> ``enable_attention_dp``.
    * ``cuda_graph`` -> a default ``CudaGraphConfig`` when true, else ``None``.
    * ``overlap_scheduler`` -> inverted into ``disable_overlap_scheduler``.
    * ``moe_backend`` -> ``MoeConfig(backend=...)``.
    * ``eagle3`` -> when true, an ``EagleDecodingConfig`` is passed as the
      speculative config and KV-cache block reuse is turned off.
    """
    # Build the CUDA graph config first, then the MoE config, matching the
    # construction order of the keyword options handed to LLM.
    graph_config = CudaGraphConfig() if cuda_graph else None
    backend_options = {
        "disable_overlap_scheduler": not overlap_scheduler,
        "cuda_graph_config": graph_config,
        "moe_config": MoeConfig(backend=moe_backend),
    }

    # Block reuse is enabled only when eagle3 is off.
    cache_options = KvCacheConfig(
        free_gpu_memory_fraction=0.4,
        enable_block_reuse=not eagle3,
    )

    if eagle3:
        draft_options = EagleDecodingConfig(
            max_draft_len=2,
            speculative_model_dir=
            f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
            eagle3_one_model=True,
        )
    else:
        draft_options = None

    checkpoint_dir = (
        f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf")

    with LLM(
            checkpoint_dir,
            tensor_parallel_size=tp_size,
            pipeline_parallel_size=pp_size,
            moe_expert_parallel_size=ep_size,
            **backend_options,
            enable_attention_dp=attention_dp,
            kv_cache_config=cache_options,
            speculative_config=draft_options,
    ) as llm:
        # Run the accuracy tasks in the same order as before: MMLU, then GSM8K.
        for benchmark in (MMLU, GSM8K):
            benchmark(self.MODEL_NAME).evaluate(llm)
|
||||
|
||||
|
||||
class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
|
||||
MODEL_NAME = "microsoft/Phi-4-mini-instruct"
|
||||
|
||||
@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
|
||||
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
|
||||
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
|
||||
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
|
||||
|
||||
@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
|
||||
|
||||
@ -69,3 +69,4 @@ l0_gb200:
|
||||
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
|
||||
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
|
||||
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)
|
||||
|
||||
@ -19,4 +19,3 @@ l0_gb200_multi_nodes:
|
||||
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (90)
|
||||
|
||||
@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
|
||||
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
|
||||
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
|
||||
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
|
||||
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
|
||||
test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user