test: add more tests for GB200 with 8 GPUs/2 nodes in L0 tests (#5397)

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
This commit is contained in:
Yi Zhang 2025-07-01 13:06:47 +08:00 committed by Zhenhuan Chen
parent cb9f596dbe
commit 73d30a23c7
2 changed files with 7 additions and 1 deletions

View File

@ -1824,7 +1824,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
fullSet += SBSASlurmTestConfigs.keySet()
multiNodesSBSAConfigs = [
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 1, 8, 2],
// Each stage test 1 testcase with 8 GPUs and 2 nodes.
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 3, 8, 2],
]
fullSet += multiNodesSBSAConfigs.keySet()

View File

@ -14,3 +14,6 @@ l0_gb200_multi_nodes:
backend: pytorch
tests:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)