[None][infra] separate AutoDeploy tests into own stages (#10634)

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Author: Lucas Liebenwein, 2026-01-14 23:05:26 -05:00, committed by GitHub
parent f7de285a82
commit 62050b2381
9 changed files with 106 additions and 30 deletions


@@ -2153,8 +2153,12 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
// If stageName contains "-FMHA-", add "backend=fmha" to makoArgs
// At this point, only tests with backend=fmha or unspecified backend will be run
makoArgs += ["backend=fmha"]
} else if (stageName.contains("-AutoDeploy-")) {
// If stageName contains "-AutoDeploy-", add "backend=autodeploy" to makoArgs
// At this point, only tests with backend=autodeploy or unspecified backend will be run
makoArgs += ["backend=autodeploy"]
} else {
// If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", or "-FMHA-", do not add any backend
// If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", "-FMHA-", or "-AutoDeploy-", do not add any backend
// At this point, all tests will be run
// For cases where backend is not specified in makoArgs, we will match all types of backends and tests without specified backend
}
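
For context, the backend-selection chain in getMakoArgsFromStageName now reads roughly as below. This is a condensed sketch reconstructed from the comments in this hunk; only the -FMHA- and -AutoDeploy- branches are visible in the diff, so the earlier branches and their backend values are assumptions.

def makoArgs = []
if (stageName.contains("-PyTorch-")) {
    makoArgs += ["backend=pytorch"]          // assumed, not shown in this diff
} else if (stageName.contains("-TensorRT-")) {
    makoArgs += ["backend=tensorrt"]         // assumed, not shown in this diff
} else if (stageName.contains("-CPP-")) {
    makoArgs += ["backend=cpp"]              // assumed, not shown in this diff
} else if (stageName.contains("-Triton-")) {
    makoArgs += ["backend=triton"]           // assumed, not shown in this diff
} else if (stageName.contains("-FMHA-")) {
    makoArgs += ["backend=fmha"]
} else if (stageName.contains("-AutoDeploy-")) {
    makoArgs += ["backend=autodeploy"]       // new in this commit
} else {
    // No backend added: tests of every backend, and tests with no backend
    // specified, are matched.
}
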
@@ -3155,6 +3159,7 @@ def launchTestJobs(pipeline, testFilter)
"A30-Triton-1": ["a30", "l0_a30", 1, 1],
"A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
"A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
"A30-AutoDeploy-1": ["a30", "l0_a30", 1, 1],
"A30-CPP-1": ["a30", "l0_a30", 1, 3],
"A30-CPP-2": ["a30", "l0_a30", 2, 3],
"A30-CPP-3": ["a30", "l0_a30", 3, 3],
@@ -3166,11 +3171,13 @@ def launchTestJobs(pipeline, testFilter)
"H100_PCIe-PyTorch-3": ["h100-cr", "l0_h100", 3, 4],
"H100_PCIe-PyTorch-4": ["h100-cr", "l0_h100", 4, 4],
"H100_PCIe-PyTorch-Ray-1": ["h100-cr", "l0_h100", 1, 1],
"H100_PCIe-AutoDeploy-1": ["h100-cr", "l0_h100", 1, 1],
"H100_PCIe-CPP-1": ["h100-cr", "l0_h100", 1, 1],
"H100_PCIe-TensorRT-1": ["h100-cr", "l0_h100", 1, 1],
"B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 3],
"B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 3],
"B200_PCIe-PyTorch-3": ["b100-ts2", "l0_b200", 3, 3],
"B200_PCIe-AutoDeploy-1": ["b100-ts2", "l0_b200", 1, 1],
"RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1],
"RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
"RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
@@ -3262,8 +3269,10 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-AutoDeploy-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-4_GPUs-AutoDeploy-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],


@@ -20,7 +20,6 @@ l0_a30:
- unittest/_torch/modeling -k "modeling_qwen_moe"
- unittest/_torch/modeling -k "modeling_out_of_tree"
- unittest/_torch/modeling -k "modeling_starcoder2"
- unittest/_torch/auto_deploy/unit/singlegpu
- unittest/_torch/sampler/test_beam_search.py
- unittest/_torch/sampler/test_return_logits.py
- test_e2e.py::test_openai_completions_with_logit_bias[torch_sampler]
@@ -244,3 +243,19 @@ l0_a30:
- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
# ------------- AutoDeploy Backend Stages ---------------
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*a30*'
linux_distribution_name: ubuntu*
terms:
stage: pre_merge
backend: autodeploy
tests:
# TODO (lucaslie): consider more fine-grained split
- unittest/_torch/auto_deploy/unit/singlegpu
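
Each of the new test-db blocks in this commit follows the same shape: a condition whose ranges, wildcards, and terms must all hold before the block's tests are scheduled on a stage. A hedged sketch of that matching with hypothetical helper and field names (the real test-db resolver may differ); the AutoDeploy blocks in the remaining files reuse the same pattern:

def matches(Map cond, Map env) {
    def rangesOk = (cond.ranges ?: [:]).every { key, r ->
        def v = env[key]
        (r.gte == null || v >= r.gte) && (r.lte == null || v <= r.lte)
    }
    def wildcardsOk = (cond.wildcards ?: [:]).every { key, pats ->
        def patterns = pats instanceof List ? pats : [pats]
        patterns.any { p -> env[key] ==~ p.replace("*", ".*") }
    }
    def termsOk = (cond.terms ?: [:]).every { key, v -> env[key] == v }
    return rangesOk && wildcardsOk && termsOk
}

// The l0_a30 block above, against a hypothetical single-GPU A30 node:
def cond = [ranges   : [system_gpu_count: [gte: 1, lte: 1]],
            wildcards: [gpu: ["*a30*"], linux_distribution_name: "ubuntu*"],
            terms    : [stage: "pre_merge", backend: "autodeploy"]]
def env = [system_gpu_count: 1, gpu: "nvidia a30",
           linux_distribution_name: "ubuntu22.04",
           stage: "pre_merge", backend: "autodeploy"]
assert matches(cond, env)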


@@ -94,10 +94,6 @@ l0_b200:
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-CUTLASS]
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-256-CUTLASS]
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_deepgemm[enable_configurable_moe-dtype1-72-256-2560-DefaultMoeRoutingMethod]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- unittest/_torch/auto_deploy/unit/singlegpu
- condition:
ranges:
system_gpu_count:
@@ -169,3 +165,20 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[enable_configurable_moe-mtp=disable-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype
# ------------- AutoDeploy Backend Stages ---------------
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*b100*'
linux_distribution_name: ubuntu*
terms:
stage: pre_merge
backend: autodeploy
tests:
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- unittest/_torch/auto_deploy/unit/singlegpu


@@ -32,9 +32,6 @@ l0_dgx_b200:
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
- accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] TIMEOUT (60)
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- condition:
ranges:
system_gpu_count:
@@ -203,3 +200,22 @@ l0_dgx_b200:
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTEDSL-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
# ------------- AutoDeploy Backend Stages ---------------
- condition:
ranges:
system_gpu_count:
gte: 4
lte: 4
wildcards:
gpu:
- '*b200*'
linux_distribution_name: ubuntu*
cpu: x86_64
terms:
stage: pre_merge
backend: autodeploy
orchestrator: mpi
tests:
- unittest/_torch/auto_deploy/unit/multigpu
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
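
This l0_dgx_b200 block additionally pins cpu: x86_64 and requests the MPI orchestrator. A minimal check of its terms against an assumed 4-GPU node environment (all values illustrative):

def terms = [stage: "pre_merge", backend: "autodeploy", orchestrator: "mpi"]
def env = [system_gpu_count: 4, gpu: "nvidia b200", cpu: "x86_64",
           linux_distribution_name: "ubuntu22.04",
           stage: "pre_merge", backend: "autodeploy", orchestrator: "mpi"]
assert terms.every { key, v -> env[key] == v }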


@@ -67,7 +67,6 @@ l0_dgx_b300:
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] TIMEOUT (180)
# ------------- AutoDeploy tests ---------------
- condition:
ranges:
system_gpu_count:


@@ -18,7 +18,6 @@ l0_dgx_h100:
- unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu2"
- unittest/llmapi/test_additional_model_outputs.py -m "gpu2"
- unittest/_torch/multi_gpu -m "not post_merge" TIMEOUT (90)
- unittest/_torch/auto_deploy/unit/multigpu
- unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism
# ------------- Disaggregated serving tests ---------------
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
@@ -44,8 +43,6 @@ l0_dgx_h100:
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
- unittest/llmapi/apps/test_disagg_serving_perf_metrics.py
- disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
# llmapi
- unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
# ------------- Skip softmax attention tests ---------------
@@ -130,8 +127,6 @@ l0_dgx_h100:
- disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing]
- disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin]
- disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- condition:
ranges:
system_gpu_count:
@@ -308,3 +303,22 @@ l0_dgx_h100:
- unittest/llmapi/test_async_llm.py -m "gpu4"
- ray_orchestrator/RL/test_rl_perf_reproduce.py::test_rl_perf_reproduce[tp2_2instances]
- ray_orchestrator/RL/test_rl_perf_reproduce.py::test_rl_perf_reproduce[tp1_4instances]
# ------------- AutoDeploy Backend Stages ---------------
- condition:
ranges:
system_gpu_count:
gte: 4
lte: 4
wildcards:
gpu:
- '*h100*'
linux_distribution_name: ubuntu*
terms:
stage: pre_merge
backend: autodeploy
auto_trigger: others
orchestrator: mpi
tests:
- unittest/_torch/auto_deploy/unit/multigpu
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16


@@ -131,10 +131,6 @@ l0_dgx_h200:
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
- condition:
ranges:
system_gpu_count:


@@ -111,16 +111,6 @@ l0_h100:
- test_e2e.py::test_openai_chat_harmony
- test_e2e.py::test_openai_responses
- test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
- examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate
- condition:
ranges:
system_gpu_count:
@@ -422,3 +412,28 @@ l0_h100:
backend: fmha
tests:
- test_fmha.py::test_fmha TIMEOUT (90)
# ------------- AutoDeploy Backend Stages ---------------
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*h100*'
linux_distribution_name: ubuntu*
terms:
stage: pre_merge
backend: autodeploy
orchestrator: mpi
tests:
- unittest/_torch/auto_deploy/unit/singlegpu
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
- examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate
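
Tying the two halves together: the new "H100_PCIe-AutoDeploy-1" stage gets backend=autodeploy from getMakoArgsFromStageName, and, assuming the pipeline also contributes stage=pre_merge to the mako args (not visible in this diff), this block's terms are satisfied:

def makoArgs = ["stage=pre_merge", "backend=autodeploy"]  // stage= is an assumption
def terms = [stage: "pre_merge", backend: "autodeploy"]
assert terms.every { k, v -> makoArgs.contains("${k}=${v}".toString()) }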


@@ -230,7 +230,6 @@ full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evi
unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_batch_sizes[model_drafter-schedule1] SKIP (https://nvbugs/5680911)
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] SKIP (https://nvbugs/5769712)
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4] SKIP (https://nvbugs/5769712)
test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5701457)
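
Waive entries like the ones above follow a "test-id SKIP (reason)" shape. A purely illustrative parse of one line; the pipeline's actual waive parser is not shown in this commit:

def line = 'accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)'
def m = line =~ /^(\S+)\s+SKIP\s*\((.+)\)$/
if (m.matches()) {
    println "waived ${m.group(1)} because ${m.group(2)}"   // test id, bug link
}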