[https://nvbugs/5437405][fix] qwen3 235b eagle3 ci (#7000)

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
Author: bhsueh_NV
Date: 2025-08-21 13:17:32 +08:00 (committed by GitHub)
Commit: ba0a86e0bb (parent: 647a52698a)
6 changed files with 51 additions and 7 deletions

@@ -2446,11 +2446,12 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
             (8, 1, 8, True, True, True, "TRTLLM", False),
-            (8, 1, 8, False, False, False, "TRTLLM", True),
+            (8, 1, 8, True, True, True, "TRTLLM", True),
         ],
         ids=[
-            "latency_moe_cutlass", "latency_moe_trtllm",
-            "latency_moe_trtllm_eagle3"
+            "latency_moe_cutlass",
+            "latency_moe_trtllm",
+            "latency_moe_trtllm_eagle3",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
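For readability, here is a hypothetical unpacking of the changed parametrize case (names mirror the argument string above; this snippet is not part of the commit). The fix flips attention_dp, cuda_graph, and overlap_scheduler to True for the 8-GPU TRTLLM + eagle3 case; the previous all-False configuration moves to the new 4-GPU test added below.

```python
# Hypothetical unpacking of the changed parametrize case, mirroring the
# "tp_size,pp_size,ep_size,..." argument string above; not part of the commit.
(tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler,
 moe_backend, eagle3) = (8, 1, 8, True, True, True, "TRTLLM", True)

# Before the fix this case ran with attention_dp=False, cuda_graph=False,
# overlap_scheduler=False; that combination now lives in test_nvfp4_4gpus.
assert attention_dp and cuda_graph and overlap_scheduler
```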
@@ -2485,6 +2486,50 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
+        [
+            (4, 1, 4, False, False, False, "TRTLLM",
+             True),  # TP8 has bug when we use TRTLLM moe backend and eagle3
+        ],
+        ids=[
+            "latency_moe_trtllm_eagle3",
+        ],
+    )
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler, moe_backend, eagle3):
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
+                                        enable_block_reuse=not eagle3)
+        spec_config = None
+        if eagle3:
+            spec_config = EagleDecodingConfig(
+                max_draft_len=2,
+                speculative_model_dir=
+                f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
+                eagle3_one_model=True)
+        with LLM(
+                f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+                tensor_parallel_size=tp_size,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                **pytorch_config,
+                enable_attention_dp=attention_dp,
+                kv_cache_config=kv_cache_config,
+                speculative_config=spec_config) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
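For context, a minimal standalone sketch of the Eagle3 one-model speculative-decoding setup the new 4-GPU test exercises. The model directories, `max_draft_len=2`, and the 0.4 KV-cache fraction are copied from the test above; the import paths, the `/models` root (a stand-in for `llm_models_root()`), and the final `generate` call are assumptions about the LLM API, not part of this commit.

```python
# Sketch only: imports assumed from tensorrt_llm's llmapi, as used elsewhere
# in this test file.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig, MoeConfig

models_root = "/models"  # stand-in for llm_models_root()

# Eagle3 one-model draft head, as configured in test_nvfp4_4gpus.
spec_config = EagleDecodingConfig(
    max_draft_len=2,
    speculative_model_dir=f"{models_root}/Qwen3/qwen3-235B-eagle3/",
    eagle3_one_model=True)

# The test ties enable_block_reuse to `not eagle3`, so reuse is off here.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
                                enable_block_reuse=False)

with LLM(f"{models_root}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
         tensor_parallel_size=4,
         moe_expert_parallel_size=4,
         moe_config=MoeConfig(backend="TRTLLM"),
         kv_cache_config=kv_cache_config,
         speculative_config=spec_config) as llm:
    # Assumed single-prompt generate() usage for illustration.
    print(llm.generate("Hello").outputs[0].text)
```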

@@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8

@@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM]

@@ -69,3 +69,4 @@ l0_gb200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
+- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)

@@ -19,4 +19,3 @@ l0_gb200_multi_nodes:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
-- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (90)

@@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
-accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)