[None][infra] separate AutoDeploy tests into own stages (#10634)
Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
parent f7de285a82
commit 62050b2381
@@ -2153,8 +2153,12 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
         // If stageName contains "-FMHA-", add "backend=fmha" to makoArgs
         // At this point, only tests with backend=fmha or unspecified backend will be run
         makoArgs += ["backend=fmha"]
+    } else if (stageName.contains("-AutoDeploy-")) {
+        // If stageName contains "-AutoDeploy-", add "backend=autodeploy" to makoArgs
+        // At this point, only tests with backend=autodeploy or unspecified backend will be run
+        makoArgs += ["backend=autodeploy"]
     } else {
-        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", or "-FMHA-", do not add any backend
+        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", "-FMHA-", or "-AutoDeploy-", do not add any backend
         // At this point, all tests will be run
         // For cases where backend is not specified in makoArgs, we will match all types of backends and tests without specified backend
     }
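For context, the chain above maps a substring of the Jenkins stage name to a mako filter term. A minimal standalone Groovy sketch of just the two branches visible in this hunk (the function name and the "-Custom-" stage name are illustrative, not from the Jenkinsfile):

```groovy
// Minimal sketch of the stage-name -> makoArgs dispatch shown in this hunk.
// Only the FMHA and AutoDeploy branches are reproduced; the real
// getMakoArgsFromStageName also handles PyTorch, TensorRT, CPP, and Triton.
def backendFromStageName(String stageName) {
    def makoArgs = []
    if (stageName.contains("-FMHA-")) {
        makoArgs += ["backend=fmha"]
    } else if (stageName.contains("-AutoDeploy-")) {
        makoArgs += ["backend=autodeploy"]
    }
    // Empty list: the stage matches tests of every backend.
    return makoArgs
}

assert backendFromStageName("A30-AutoDeploy-1") == ["backend=autodeploy"]
assert backendFromStageName("H100_PCIe-FMHA-1") == ["backend=fmha"]
assert backendFromStageName("A30-Custom-1") == []  // hypothetical stage name
```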
@@ -3155,6 +3159,7 @@ def launchTestJobs(pipeline, testFilter)
         "A30-Triton-1": ["a30", "l0_a30", 1, 1],
         "A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
         "A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
+        "A30-AutoDeploy-1": ["a30", "l0_a30", 1, 1],
         "A30-CPP-1": ["a30", "l0_a30", 1, 3],
         "A30-CPP-2": ["a30", "l0_a30", 2, 3],
         "A30-CPP-3": ["a30", "l0_a30", 3, 3],
@@ -3166,11 +3171,13 @@ def launchTestJobs(pipeline, testFilter)
         "H100_PCIe-PyTorch-3": ["h100-cr", "l0_h100", 3, 4],
         "H100_PCIe-PyTorch-4": ["h100-cr", "l0_h100", 4, 4],
         "H100_PCIe-PyTorch-Ray-1": ["h100-cr", "l0_h100", 1, 1],
+        "H100_PCIe-AutoDeploy-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-TensorRT-1": ["h100-cr", "l0_h100", 1, 1],
         "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 3],
         "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 3],
         "B200_PCIe-PyTorch-3": ["b100-ts2", "l0_b200", 3, 3],
+        "B200_PCIe-AutoDeploy-1": ["b100-ts2", "l0_b200", 1, 1],
         "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1],
         "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
         "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
@@ -3262,8 +3269,10 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-AutoDeploy-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
         "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
         "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
+        "DGX_B200-4_GPUs-AutoDeploy-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
         "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
         "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
         "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
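Each entry added above maps a Jenkins stage name to a positional config list. The field meanings in the sketch below are inferred from the entries in this diff (the shard pair from values like `1, 3`, the GPU count from the `-4_GPUs-` stages); the names are assumptions, not the Jenkinsfile's own:

```groovy
// Hypothetical decoder for the positional stage-config lists above.
// Field names are inferred from this diff, not taken from the Jenkinsfile.
def describeStage(String name, List cfg) {
    def desc = [
        nodeLabel : cfg[0],  // agent/queue label, e.g. "a30" or "b200-x4-lbd"
        testList  : cfg[1],  // test-db list the stage draws from, e.g. "l0_a30"
        shardId   : cfg[2],  // which shard of that list this stage runs
        shardCount: cfg[3],  // total number of shards the list is split into
    ]
    if (cfg.size() > 4) {
        desc.gpuCount = cfg[4]  // only present on multi-GPU stages
    }
    // The trailing `1, true` on the DGX_B200 entries is left undecoded here;
    // its meaning is not visible in this diff.
    return "${name} -> ${desc}"
}

println describeStage("A30-AutoDeploy-1", ["a30", "l0_a30", 1, 1])
println describeStage("DGX_H100-4_GPUs-AutoDeploy-1",
                      ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4])
```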
@@ -20,7 +20,6 @@ l0_a30:
   - unittest/_torch/modeling -k "modeling_qwen_moe"
   - unittest/_torch/modeling -k "modeling_out_of_tree"
   - unittest/_torch/modeling -k "modeling_starcoder2"
-  - unittest/_torch/auto_deploy/unit/singlegpu
   - unittest/_torch/sampler/test_beam_search.py
   - unittest/_torch/sampler/test_return_logits.py
   - test_e2e.py::test_openai_completions_with_logit_bias[torch_sampler]
@@ -244,3 +243,19 @@ l0_a30:
   - triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a30*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: autodeploy
+  tests:
+  # TODO (lucaslie): consider more fine-grained split
+  - unittest/_torch/auto_deploy/unit/singlegpu

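This new stanza follows the schema used throughout these test-db files: `ranges` bounds the machine's GPU count, `wildcards` globs against hardware and OS attributes, and `terms` must line up with the mako args derived from the stage name (here `backend=autodeploy`, from the Jenkinsfile hunk above). A rough Groovy sketch of that matching, assuming the three sections are simply ANDed; the real matcher lives in the test-db tooling and is not part of this diff:

```groovy
// Rough sketch of test-db condition matching; the actual matcher is part of
// the test-db tooling and is not shown in this diff.
import java.nio.file.FileSystems

def globMatch(String pattern, String value) {
    def fs = FileSystems.default
    return fs.getPathMatcher("glob:${pattern}").matches(fs.getPath(value))
}

def matches(Map condition, Map machine, Map makoTerms) {
    def range = condition.ranges.system_gpu_count
    def gpusOk = machine.system_gpu_count >= range.gte &&
                 machine.system_gpu_count <= range.lte
    def hwOk = condition.wildcards.gpu.any { globMatch(it, machine.gpu) } &&
               globMatch(condition.wildcards.linux_distribution_name,
                         machine.linux_distribution_name)
    def termsOk = condition.terms.every { k, v -> makoTerms[k] == v }
    return gpusOk && hwOk && termsOk
}

def cond = [
    ranges   : [system_gpu_count: [gte: 1, lte: 1]],
    wildcards: [gpu: ["*a30*"], linux_distribution_name: "ubuntu*"],
    terms    : [stage: "pre_merge", backend: "autodeploy"],
]
def machine = [system_gpu_count: 1, gpu: "nvidia-a30",
               linux_distribution_name: "ubuntu22.04"]
assert matches(cond, machine, [stage: "pre_merge", backend: "autodeploy"])
assert !matches(cond, machine, [stage: "pre_merge", backend: "fmha"])
```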
@@ -94,10 +94,6 @@ l0_b200:
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-CUTLASS]
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-256-CUTLASS]
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_deepgemm[enable_configurable_moe-dtype1-72-256-2560-DefaultMoeRoutingMethod]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
-  - unittest/_torch/auto_deploy/unit/singlegpu
 - condition:
     ranges:
       system_gpu_count:
@@ -169,3 +165,20 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[enable_configurable_moe-mtp=disable-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*b100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: autodeploy
+  tests:
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
+  - unittest/_torch/auto_deploy/unit/singlegpu

@@ -32,9 +32,6 @@ l0_dgx_b200:
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
   - accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] TIMEOUT (60)
-
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
 - condition:
     ranges:
       system_gpu_count:
@@ -203,3 +200,22 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTEDSL-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False]
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: pre_merge
+      backend: autodeploy
+      orchestrator: mpi
+  tests:
+  - unittest/_torch/auto_deploy/unit/multigpu
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16

@@ -67,7 +67,6 @@ l0_dgx_b300:
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] TIMEOUT (180)
-  # ------------- AutoDeploy tests ---------------
 - condition:
     ranges:
       system_gpu_count:

@@ -18,7 +18,6 @@ l0_dgx_h100:
   - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu2"
   - unittest/llmapi/test_additional_model_outputs.py -m "gpu2"
   - unittest/_torch/multi_gpu -m "not post_merge" TIMEOUT (90)
-  - unittest/_torch/auto_deploy/unit/multigpu
   - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism
   # ------------- Disaggregated serving tests ---------------
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
@@ -44,8 +43,6 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
   - unittest/llmapi/apps/test_disagg_serving_perf_metrics.py
   - disaggregated/test_disaggregated.py::test_disaggregated_cancel_large_context_requests[DeepSeek-V3-Lite-bf16]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2]
   # llmapi
   - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks
   # ------------- Skip softmax attention tests ---------------
@@ -130,8 +127,6 @@ l0_dgx_h100:
   - disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing]
   - disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin]
   - disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
 - condition:
     ranges:
       system_gpu_count:
@@ -308,3 +303,22 @@ l0_dgx_h100:
   - unittest/llmapi/test_async_llm.py -m "gpu4"
   - ray_orchestrator/RL/test_rl_perf_reproduce.py::test_rl_perf_reproduce[tp2_2instances]
   - ray_orchestrator/RL/test_rl_perf_reproduce.py::test_rl_perf_reproduce[tp1_4instances]
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: autodeploy
+      auto_trigger: others
+      orchestrator: mpi
+  tests:
+  - unittest/_torch/auto_deploy/unit/multigpu
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16

@@ -131,10 +131,6 @@ l0_dgx_h200:
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
 - condition:
     ranges:
       system_gpu_count:

@@ -111,16 +111,6 @@ l0_h100:
   - test_e2e.py::test_openai_chat_harmony
   - test_e2e.py::test_openai_responses
   - test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B]
-  # ------------- AutoDeploy tests ---------------
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
-  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
-  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
-  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
-  - examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate
 - condition:
     ranges:
       system_gpu_count:
@@ -422,3 +412,28 @@ l0_h100:
       backend: fmha
   tests:
   - test_fmha.py::test_fmha TIMEOUT (90)
+# ------------- AutoDeploy Backend Stages ---------------
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: autodeploy
+      orchestrator: mpi
+  tests:
+  - unittest/_torch/auto_deploy/unit/singlegpu
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
+  - examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]
+  - examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate

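Tying the two halves of the change together: a stage named `H100_PCIe-AutoDeploy-1` yields `backend=autodeploy` from `getMakoArgsFromStageName`, which is what the `terms.backend: autodeploy` entries in these new stanzas match against. A hypothetical conversion of those mako args into the term map used for matching (the `stage: pre_merge` value is filled in by hand here; how the pipeline derives it is not shown in this diff):

```groovy
// Hypothetical glue between the Jenkinsfile hunk and the test-db stanzas:
// turn "key=value" mako args into the term map a stanza's `terms` must match.
def makoArgs = ["backend=autodeploy"]  // as produced for "...-AutoDeploy-..." stages
def makoTerms = makoArgs.collectEntries { arg ->
    def (k, v) = arg.split("=", 2)
    [(k): v]
}
makoTerms.stage = "pre_merge"  // assumed; set by the pipeline, not shown here
assert makoTerms == [backend: "autodeploy", stage: "pre_merge"]
```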
@@ -230,7 +230,6 @@ full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evi
 unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_batch_sizes[model_drafter-schedule1] SKIP (https://nvbugs/5680911)
 accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
 accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)
-accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] SKIP (https://nvbugs/5769712)
 accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4] SKIP (https://nvbugs/5769712)
 test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5701457)