Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00

move some test cases of TensorRT backend back (#5232)

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>

Parent: faca19c2f0
Commit: 517c1ecf72
@@ -1527,6 +1527,12 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
         "A10-CPP-1": ["a10", "l0_a10", 1, 1],
+        "A10-TensorRT-1": ["a10", "l0_a10", 1, 6],
+        "A10-TensorRT-2": ["a10", "l0_a10", 2, 6],
+        "A10-TensorRT-3": ["a10", "l0_a10", 3, 6],
+        "A10-TensorRT-4": ["a10", "l0_a10", 4, 6],
+        "A10-TensorRT-5": ["a10", "l0_a10", 5, 6],
+        "A10-TensorRT-6": ["a10", "l0_a10", 6, 6],
         "A30-Triton-1": ["a30", "l0_a30", 1, 1],
         "A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
         "A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
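Each entry in the map above appears to follow the pattern name: [node label, test-db config, split index, split count(, GPU count)]; the DGX_H100 entry carries a fifth element, 4, matching its four GPUs. A minimal Groovy sketch of that reading (the helper and field names are illustrative assumptions, not identifiers from the Jenkinsfile):

    // Hypothetical decoder for one stage entry; field names are illustrative.
    def decodeStage(String name, List v) {
        return [
            name      : name,
            nodeLabel : v[0],                     // e.g. "a10"
            testDb    : v[1],                     // e.g. "l0_a10"
            splitId   : v[2],                     // 1-based shard index
            splitCount: v[3],                     // total shards of this config
            gpuCount  : v.size() > 4 ? v[4] : 1   // optional fifth element
        ]
    }

    assert decodeStage("A10-TensorRT-3", ["a10", "l0_a10", 3, 6]).splitCount == 6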
@@ -1538,19 +1544,19 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "H100_PCIe-PyTorch-2": ["h100-cr", "l0_h100", 2, 3],
         "H100_PCIe-PyTorch-3": ["h100-cr", "l0_h100", 3, 3],
         "H100_PCIe-CPP-1": ["h100-cr", "l0_h100", 1, 1],
+        "H100_PCIe-TensorRT-1": ["h100-cr", "l0_h100", 1, 2],
+        "H100_PCIe-TensorRT-2": ["h100-cr", "l0_h100", 2, 2],
         "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 2],
         "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 2],
+        "B200_PCIe-TensorRT-1": ["b100-ts2", "l0_b200", 1, 2],
+        "B200_PCIe-TensorRT-2": ["b100-ts2", "l0_b200", 2, 2],
         "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1],
+        "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
+        "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
         // Currently post-merge test stages only run tests with "stage: post_merge" mako
         // in the test-db. This behavior may change in the future.
-        "A10-TensorRT-[Post-Merge]-1": ["a10", "l0_a10", 1, 8],
-        "A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 8],
-        "A10-TensorRT-[Post-Merge]-3": ["a10", "l0_a10", 3, 8],
-        "A10-TensorRT-[Post-Merge]-4": ["a10", "l0_a10", 4, 8],
-        "A10-TensorRT-[Post-Merge]-5": ["a10", "l0_a10", 5, 8],
-        "A10-TensorRT-[Post-Merge]-6": ["a10", "l0_a10", 6, 8],
-        "A10-TensorRT-[Post-Merge]-7": ["a10", "l0_a10", 7, 8],
-        "A10-TensorRT-[Post-Merge]-8": ["a10", "l0_a10", 8, 8],
+        "A10-TensorRT-[Post-Merge]-1": ["a10", "l0_a10", 1, 2],
+        "A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 2],
         "A30-TensorRT-[Post-Merge]-1": ["a30", "l0_a30", 1, 6],
         "A30-TensorRT-[Post-Merge]-2": ["a30", "l0_a30", 2, 6],
         "A30-TensorRT-[Post-Merge]-3": ["a30", "l0_a30", 3, 6],
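The two comment lines in the hunk above explain the [Post-Merge] naming: post-merge stages run only the test-db entries whose stage mako term is post_merge, while the new unsuffixed TensorRT stages pick up the pre_merge entries. A hedged Groovy sketch of that mapping (the helper name is an assumption; only the naming convention and the two term values come from this diff):

    // Hypothetical: derive the "stage" mako term from a stage name.
    def makoStageTerm(String stageName) {
        return stageName.contains('[Post-Merge]') ? 'post_merge' : 'pre_merge'
    }

    assert makoStageTerm('A10-TensorRT-[Post-Merge]-1') == 'post_merge'
    assert makoStageTerm('A10-TensorRT-1') == 'pre_merge'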
@@ -1575,18 +1581,12 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "L40S-TensorRT-[Post-Merge]-5": ["l40s", "l0_l40s", 5, 5],
         "H100_PCIe-PyTorch-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
-        "H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-3": ["h100-cr", "l0_h100", 3, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-4": ["h100-cr", "l0_h100", 4, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-5": ["h100-cr", "l0_h100", 5, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-6": ["h100-cr", "l0_h100", 6, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-7": ["h100-cr", "l0_h100", 7, 7],
+        "H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-3": ["h100-cr", "l0_h100", 3, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-4": ["h100-cr", "l0_h100", 4, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-5": ["h100-cr", "l0_h100", 5, 5],
         "B200_PCIe-Triton-Python-[Post-Merge]-1": ["b100-ts2", "l0_b200", 1, 1],
-        "B200_PCIe-[Post-Merge]-TensorRT-1": ["b100-ts2", "l0_b200", 1, 2],
-        "B200_PCIe-[Post-Merge]-TensorRT-2": ["b100-ts2", "l0_b200", 2, 2],
-        "RTX5080-[Post-Merge]-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
-        "RTX5080-[Post-Merge]-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
         "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
         "H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1],
         "DGX_H200-8_GPUs-PyTorch-[Post-Merge]-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8],
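Taken together, these hunks rebalance the TensorRT load by moving tests from post-merge to pre-merge and shrinking the post-merge split counts (A10: 8 to 2, H100: 7 to 5). The splitId/splitCount pair shards one test-db config across parallel stages; below is a sketch of one plausible 1-based selection scheme (the real selection lives in the test-db tooling, so treat the round-robin rule as an assumption, not the actual implementation):

    // Hypothetical round-robin sharding of an ordered test list.
    def shard(List tests, int splitId, int splitCount) {
        def picked = []
        tests.eachWithIndex { t, i ->
            if (i % splitCount == splitId - 1) { picked << t }
        }
        return picked
    }

    def tests = (1..10).collect { 'test_' + it }
    assert shard(tests, 1, 5) == ['test_1', 'test_6']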
@@ -51,7 +51,7 @@ l0_a10:
       - '*a10*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -64,7 +64,7 @@ l0_b200:
       - '*b100*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -10,7 +10,7 @@ l0_gb203:
       - '*gb203*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -120,7 +120,7 @@ l0_h100:
       - '*h100*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -129,52 +129,30 @@ l0_h100:
   - unittest/trt/quantization/test_weight_only_quant_matmul.py
   - unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py
   - test_e2e.py::test_trtllm_bench_sanity[-extra_config-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_sanity[-extra_config-non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_sanity[--streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_sanity[--non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_latency_sanity[FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-enable_request_rate] # negative test
   - test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_iteration_log[TRT-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B]
-  - accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive # 6 mins
-  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] # 5 mins
-  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] # 5 mins
   - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
   - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b]
   - examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct]
   - examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it]
   - examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8]
-  - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
-  - unittest/trt/model_api/test_model_level_api.py # 9 mins on H100
-  - unittest/trt/model_api/test_model_api_multi_gpu.py # 0.5 mins on H100
-  - unittest/trt/model/test_gpt_e2e.py # 3 mins / 6 mins on H100
   - unittest/trt/model/eagle # 1 mins on H100
   - unittest/test_model_runner_cpp.py
   - test_cache.py::test_cache_sanity # 1 sec
   - unittest/llmapi/test_llm_quant.py # 5.5 mins on H100
   - test_e2e.py::test_mistral_large_hidden_vocab_size
   - llmapi/test_llm_examples.py::test_llmapi_quickstart_atexit
-  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
-  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
   - unittest/trt/attention/test_gpt_attention_IFB.py
-  - unittest/trt/attention/test_gpt_attention_no_cache.py
-  - unittest/trt/model/test_mamba.py # 3 mins
-  - examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
-  - examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
   - accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_plugin
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_swiglu_plugin
-  - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin
   - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-  - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
-  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
-  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
   - examples/test_enc_dec.py::test_llm_enc_dec_mmlu[flan-t5-small-float32-tp:1-pp:1-nb:1-disable_fp8] # 4 mins
-  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
-  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
 - condition:
     ranges:
       system_gpu_count:
@@ -300,3 +278,25 @@ l0_h100:
   - test_e2e.py::test_build_time_benchmark_sanity
   - accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
   - accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype
+  - unittest/trt/model/test_mamba.py # 3 mins
+  - examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
+  - examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
+  - unittest/trt/model_api/test_model_level_api.py # 9 mins on H100
+  - unittest/trt/model_api/test_model_api_multi_gpu.py # 0.5 mins on H100
+  - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin
+  - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
+  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
+  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
+  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
+  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
+  - test_e2e.py::test_trtllm_bench_sanity[-extra_config-non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - test_e2e.py::test_trtllm_bench_sanity[--non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - test_e2e.py::test_trtllm_bench_iteration_log[TRT-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive # 6 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] # 5 mins
+  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
+  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
+  - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
+  - unittest/trt/model/test_gpt_e2e.py # 3 mins / 6 mins on H100
+  - unittest/trt/attention/test_gpt_attention_no_cache.py