Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[https://nvbugs/5437405][fix] qwen3 235b eagle3 ci (#7000)

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>

commit ba0a86e0bb (parent 647a52698a)
@@ -2446,11 +2446,12 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
             (8, 1, 8, True, True, True, "TRTLLM", False),
-            (8, 1, 8, False, False, False, "TRTLLM", True),
+            (8, 1, 8, True, True, True, "TRTLLM", True),
         ],
         ids=[
-            "latency_moe_cutlass", "latency_moe_trtllm",
-            "latency_moe_trtllm_eagle3"
+            "latency_moe_cutlass",
+            "latency_moe_trtllm",
+            "latency_moe_trtllm_eagle3",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
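A note on the ids reflow above: pytest pairs each ids entry with the parameter tuple at the same index, and that id becomes the bracketed suffix of the collected node ID, e.g. test_nvfp4[latency_moe_trtllm_eagle3]. Those node IDs are exactly what the test-list and waive files in the later hunks reference, which is why moving the eagle3 case into a new test function forces the list updates below. A minimal self-contained sketch of the mechanism (test and id names here are illustrative, not from this commit):

import pytest

@pytest.mark.parametrize(
    "tp_size,eagle3",
    [
        (8, False),
        (4, True),
    ],
    ids=[
        "tp8_plain",    # collected as test_demo[tp8_plain]
        "tp4_eagle3",   # collected as test_demo[tp4_eagle3]
    ],
)
def test_demo(tp_size, eagle3):
    # Select one variant by node ID: pytest demo.py::test_demo[tp4_eagle3]
    assert tp_size in (4, 8)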
@@ -2485,6 +2486,50 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+        [
+            (4, 1, 4, False, False, False, "TRTLLM",
+             True),  # TP8 has bug when we use TRTLLM moe backend and eagle3
+        ],
+        ids=[
+            "latency_moe_trtllm_eagle3",
+        ],
+    )
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler, moe_backend, eagle3):
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        enable_block_reuse=not eagle3)
+        spec_config = None
+        if eagle3:
+            spec_config = EagleDecodingConfig(
+                max_draft_len=2,
+                speculative_model_dir=
+                f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+                eagle3_one_model=True)
+        with LLM(
+                f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+                tensor_parallel_size=tp_size,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                **pytorch_config,
+                enable_attention_dp=attention_dp,
+                kv_cache_config=kv_cache_config,
+                speculative_config=spec_config) as llm:
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
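Within the new test, note the coupling enable_block_reuse=not eagle3: KV-cache block reuse is switched off exactly when EAGLE3 speculative decoding is enabled, and the draft model is attached through speculative_config. A condensed sketch of just that configuration logic, assuming the tensorrt_llm.llmapi exports this test file already relies on (the draft_dir argument is illustrative):

from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig

def eagle3_configs(eagle3: bool, draft_dir: str):
    # Mirrors the test above: block reuse is disabled whenever EAGLE3 is on.
    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
                                    enable_block_reuse=not eagle3)
    # One-model EAGLE3 with a draft length of 2, as in the new test.
    spec_config = EagleDecodingConfig(
        max_draft_len=2,
        speculative_model_dir=draft_dir,
        eagle3_one_model=True) if eagle3 else None
    return kv_cache_config, spec_config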
@@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
@@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]
@@ -69,3 +69,4 @@ l0_gb200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
+- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)
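The new l0_gb200 entry also pins a per-test timeout via the TIMEOUT (90) suffix. For readers unfamiliar with the list format, here is a small illustrative parser for such entries; this is an assumption about the format only, not the repo's actual tooling, and the time unit (presumably minutes) is not stated in this diff:

import re

# Hypothetical helper, not from the repository: splits a test-list line such
# as "- pkg/test.py::TestX::test_y[id] TIMEOUT (90)" into the pytest node ID
# and the optional timeout annotation.
LINE_RE = re.compile(r"^-\s+(?P<node>\S+)(?:\s+TIMEOUT\s+\((?P<timeout>\d+)\))?$")

def parse_entry(line: str):
    m = LINE_RE.match(line.strip())
    if not m:
        raise ValueError(f"unrecognized test-list entry: {line!r}")
    timeout = int(m.group("timeout")) if m.group("timeout") else None
    return m.group("node"), timeout

node, timeout = parse_entry(
    "- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::"
    "test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)")
assert timeout == 90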
@@ -19,4 +19,3 @@ l0_gb200_multi_nodes:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
-- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (90)
@@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)