From 62050b23815c44dc6deeb04c338248cfa4950929 Mon Sep 17 00:00:00 2001
From: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Date: Wed, 14 Jan 2026 23:05:26 -0500
Subject: [PATCH] [None][infra] separate AutoDeploy tests into own stages
 (#10634)

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
---
 jenkins/L0_Test.groovy                         | 11 +++++-
 .../integration/test_lists/test-db/l0_a30.yml  | 17 ++++++++-
 .../test_lists/test-db/l0_b200.yml             | 21 ++++++++---
 .../test_lists/test-db/l0_dgx_b200.yml         | 22 ++++++++++--
 .../test_lists/test-db/l0_dgx_b300.yml         |  1 -
 .../test_lists/test-db/l0_dgx_h100.yml         | 24 ++++++++++---
 .../test_lists/test-db/l0_dgx_h200.yml         |  4 ---
 .../test_lists/test-db/l0_h100.yml             | 35 +++++++++++++------
 tests/integration/test_lists/waives.txt        |  1 -
 9 files changed, 106 insertions(+), 30 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index bd2cdd21a1..901ed9e932 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2153,8 +2153,12 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
         // If stageName contains "-FMHA-", add "backend=fmha" to makoArgs
         // At this point, only tests with backend=fmha or unspecified backend will be run
         makoArgs += ["backend=fmha"]
+    } else if (stageName.contains("-AutoDeploy-")) {
+        // If stageName contains "-AutoDeploy-", add "backend=autodeploy" to makoArgs
+        // At this point, only tests with backend=autodeploy or unspecified backend will be run
+        makoArgs += ["backend=autodeploy"]
     } else {
-        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", or "-FMHA-", do not add any backend
+        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", "-FMHA-", or "-AutoDeploy-", do not add any backend
         // At this point, all tests will be run
         // For cases where backend is not specified in makoArgs, we will match all types of backends and tests without specified backend
     }
@@ -3155,6 +3159,7 @@ def launchTestJobs(pipeline, testFilter)
         "A30-Triton-1": ["a30", "l0_a30", 1, 1],
         "A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
         "A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
+        "A30-AutoDeploy-1": ["a30", "l0_a30", 1, 1],
         "A30-CPP-1": ["a30", "l0_a30", 1, 3],
         "A30-CPP-2": ["a30", "l0_a30", 2, 3],
         "A30-CPP-3": ["a30", "l0_a30", 3, 3],
@@ -3166,11 +3171,13 @@
         "H100_PCIe-PyTorch-3": ["h100-cr", "l0_h100", 3, 4],
         "H100_PCIe-PyTorch-4": ["h100-cr", "l0_h100", 4, 4],
         "H100_PCIe-PyTorch-Ray-1": ["h100-cr", "l0_h100", 1, 1],
+        "H100_PCIe-AutoDeploy-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-TensorRT-1": ["h100-cr", "l0_h100", 1, 1],
         "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 3],
         "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 3],
         "B200_PCIe-PyTorch-3": ["b100-ts2", "l0_b200", 3, 3],
+        "B200_PCIe-AutoDeploy-1": ["b100-ts2", "l0_b200", 1, 1],
         "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1],
         "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
         "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
@@ -3262,8 +3269,10 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-AutoDeploy-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
         "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
+        "DGX_B200-4_GPUs-AutoDeploy-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
         "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
         "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
         "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml
index 1a6b95fbb6..7578f02fdc 100644
--- a/tests/integration/test_lists/test-db/l0_a30.yml
+++ b/tests/integration/test_lists/test-db/l0_a30.yml
@@ -20,7 +20,6 @@ l0_a30:
   - unittest/_torch/modeling -k "modeling_qwen_moe"
   - unittest/_torch/modeling -k "modeling_out_of_tree"
   - unittest/_torch/modeling -k "modeling_starcoder2"
-  - unittest/_torch/auto_deploy/unit/singlegpu
   - unittest/_torch/sampler/test_beam_search.py
   - unittest/_torch/sampler/test_return_logits.py
   - test_e2e.py::test_openai_completions_with_logit_bias[torch_sampler]
@@ -244,3 +243,19 @@ l0_a30:
   - triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a30*'
+      linux_distribution_name: ubuntu*
+  terms:
+    stage: pre_merge
+    backend: autodeploy
+  tests:
+  # TODO (lucaslie): consider more fine-grained split
+  - unittest/_torch/auto_deploy/unit/singlegpu
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 02616d7eda..4e82657d74 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -94,10 +94,6 @@ l0_b200:
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-CUTLASS]
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-256-CUTLASS]
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_deepgemm[enable_configurable_moe-dtype1-72-256-2560-DefaultMoeRoutingMethod]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
-  - unittest/_torch/auto_deploy/unit/singlegpu
 - condition:
     ranges:
       system_gpu_count:
@@ -169,3 +165,20 @@
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[enable_configurable_moe-mtp=disable-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*b100*'
+      linux_distribution_name: ubuntu*
+  terms:
+    stage: pre_merge
+    backend: autodeploy
+  tests:
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
+  - unittest/_torch/auto_deploy/unit/singlegpu
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 84af0aae2b..69972e5000 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -32,9 +32,6 @@ l0_dgx_b200:
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
   - accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] TIMEOUT (60)
-
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
 - condition:
     ranges:
       system_gpu_count:
@@ -203,3 +200,22 @@
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTEDSL-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+  terms:
+    stage: pre_merge
+    backend: autodeploy
+    orchestrator: mpi
+  tests:
+  - unittest/_torch/auto_deploy/unit/multigpu
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300.yml b/tests/integration/test_lists/test-db/l0_dgx_b300.yml
index c09e3a0415..81031354d3 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b300.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b300.yml
@@ -67,7 +67,6 @@ l0_dgx_b300:
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] TIMEOUT (180)
-  # ------------- AutoDeploy tests ---------------
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 93606c7f73..c4b7d50f3c 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -18,7 +18,6 @@ l0_dgx_h100:
   - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu2"
   - unittest/llmapi/test_additional_model_outputs.py -m "gpu2"
   - unittest/_torch/multi_gpu -m "not post_merge" TIMEOUT (90)
-  - unittest/_torch/auto_deploy/unit/multigpu
   - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism
   # ------------- Disaggregated serving tests ---------------
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
@@ -44,8 +43,6 @@
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
   - unittest/llmapi/apps/test_disagg_serving_perf_metrics.py
   - disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
   # llmapi
   - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
   # ------------- Skip softmax attention tests ---------------
@@ -130,8 +127,6 @@
   - disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing]
   - disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin]
   - disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
 - condition:
     ranges:
       system_gpu_count:
@@ -308,3 +303,22 @@
   - unittest/llmapi/test_async_llm.py -m "gpu4"
   - ray_orchestrator/RL/test_rl_perf_reproduce.py::test_rl_perf_reproduce[tp2_2instances]
   - ray_orchestrator/RL/test_rl_perf_reproduce.py::test_rl_perf_reproduce[tp1_4instances]
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+  terms:
+    stage: pre_merge
+    backend: autodeploy
+    auto_trigger: others
+    orchestrator: mpi
+  tests:
+  - unittest/_torch/auto_deploy/unit/multigpu
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
index 3d3b12fdfb..bd669e222a 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -131,10 +131,6 @@ l0_dgx_h200:
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 994c43a1fc..0e9fc8b8d6 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -111,16 +111,6 @@ l0_h100:
   - test_e2e.py::test_openai_chat_harmony
   - test_e2e.py::test_openai_responses
   - test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
-  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
-  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
-  - examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate
 - condition:
     ranges:
       system_gpu_count:
@@ -422,3 +412,28 @@
     backend: fmha
   tests:
   - test_fmha.py::test_fmha TIMEOUT (90)
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+  terms:
+    stage: pre_merge
+    backend: autodeploy
+    orchestrator: mpi
+  tests:
+  - unittest/_torch/auto_deploy/unit/singlegpu
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
+  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
+  - examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 7d34eeb7db..c15436fa8f 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -230,7 +230,6 @@ full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evi
 unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_batch_sizes[model_drafter-schedule1] SKIP (https://nvbugs/5680911)
 accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
 accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)
-accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] SKIP (https://nvbugs/5769712)
 accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4] SKIP (https://nvbugs/5769712)
 test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5701457)
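
The Groovy hunk above keys everything off the stage name: a "-AutoDeploy-" marker now contributes "backend=autodeploy" to the mako arguments, exactly as "-FMHA-" contributes "backend=fmha". Below is a minimal standalone sketch of that selection rule; backendFromStageName is an invented helper name (the real logic lives in getMakoArgsFromStageName in jenkins/L0_Test.groovy), and every marker-to-value pair other than fmha and autodeploy is assumed from the comment in the patch rather than shown by it.

// Illustrative sketch only -- not the actual pipeline code.
def backendFromStageName(String stageName) {
    def markers = [
        "-PyTorch-"   : "pytorch",     // assumed
        "-TensorRT-"  : "tensorrt",    // assumed
        "-CPP-"       : "cpp",         // assumed
        "-Triton-"    : "triton",      // assumed
        "-FMHA-"      : "fmha",        // shown in the patch
        "-AutoDeploy-": "autodeploy",  // added by the patch
    ]
    def hit = markers.find { marker, backend -> stageName.contains(marker) }
    // No marker -> no backend mako arg; such a stage runs tests of every backend.
    return hit ? ["backend=" + hit.value] : []
}

assert backendFromStageName("A30-AutoDeploy-1") == ["backend=autodeploy"]
assert backendFromStageName("DGX_H100-4_GPUs-AutoDeploy-1") == ["backend=autodeploy"]
assert backendFromStageName("A30-Triton-1") == ["backend=triton"]
assert backendFromStageName("DGX_B200-8_GPUs-PyTorch-Post-Merge-1") == ["backend=pytorch"]

This is why the patch can move the AutoDeploy entries out of the PyTorch stages: each new "*-AutoDeploy-N" stage added to launchTestJobs picks up the autodeploy backend filter purely by naming convention, with no other pipeline plumbing.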
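On the test-db side, each new "AutoDeploy Backend Stages" block gates its tests behind terms such as stage: pre_merge and backend: autodeploy. Going by the comments in getMakoArgsFromStageName, an entry is selected when every term it declares agrees with the stage's mako args, and a key the stage does not set at all matches anything. A hypothetical Groovy matcher expressing that rule (stageMatchesTerms is an invented name; the actual matching is done by the test-db tooling, not by code in this patch):

// Hypothetical sketch of the term-matching rule, under the assumptions above.
def stageMatchesTerms(List<String> makoArgs, Map<String, String> terms) {
    // Turn ["backend=autodeploy", "stage=pre_merge"] into a key/value map.
    def mako = makoArgs.collectEntries { arg ->
        def (key, value) = arg.split("=", 2)
        [(key): value]
    }
    // Every declared term must agree with the stage; a key the stage never
    // sets counts as a match, mirroring "unspecified backend matches all".
    return terms.every { key, value -> !mako.containsKey(key) || mako[key] == value }
}

// An AutoDeploy stage selects autodeploy-tagged tests...
assert stageMatchesTerms(["backend=autodeploy", "stage=pre_merge"],
                         [backend: "autodeploy", stage: "pre_merge"])
// ...a PyTorch stage no longer does, which is the point of this patch...
assert !stageMatchesTerms(["backend=pytorch"], [backend: "autodeploy"])
// ...and a stage that sets no backend at all would still match everything.
assert stageMatchesTerms(["stage=pre_merge"], [stage: "pre_merge"])

Under this reading, deleting the "AutoDeploy tests" entries from the PyTorch/generic blocks and re-adding them under backend: autodeploy conditions moves them wholesale into the new dedicated stages without changing which GPUs or merge gates they run on.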