Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00

move some test cases of TensorRT backend back (#5232)

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>

Parent: faca19c2f0
Commit: 517c1ecf72
@@ -1527,6 +1527,12 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
         "A10-CPP-1": ["a10", "l0_a10", 1, 1],
+        "A10-TensorRT-1": ["a10", "l0_a10", 1, 6],
+        "A10-TensorRT-2": ["a10", "l0_a10", 2, 6],
+        "A10-TensorRT-3": ["a10", "l0_a10", 3, 6],
+        "A10-TensorRT-4": ["a10", "l0_a10", 4, 6],
+        "A10-TensorRT-5": ["a10", "l0_a10", 5, 6],
+        "A10-TensorRT-6": ["a10", "l0_a10", 6, 6],
         "A30-Triton-1": ["a30", "l0_a30", 1, 1],
         "A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
         "A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
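Each entry in the map above appears to follow the pattern name: [node label, test-db config, split index, split count(, GPU count)]; the DGX_H100 entry carries a fifth element, 4, matching its four GPUs. A minimal Groovy sketch of that reading (the helper and field names are illustrative assumptions, not identifiers from the Jenkinsfile):

    // Hypothetical decoder for one stage entry; field names are illustrative.
    def decodeStage(String name, List v) {
        return [
            name      : name,
            nodeLabel : v[0],                     // e.g. "a10"
            testDb    : v[1],                     // e.g. "l0_a10"
            splitId   : v[2],                     // 1-based shard index
            splitCount: v[3],                     // total shards of this config
            gpuCount  : v.size() > 4 ? v[4] : 1   // optional fifth element
        ]
    }

    assert decodeStage("A10-TensorRT-3", ["a10", "l0_a10", 3, 6]).splitCount == 6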
@@ -1538,19 +1544,19 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "H100_PCIe-PyTorch-2": ["h100-cr", "l0_h100", 2, 3],
         "H100_PCIe-PyTorch-3": ["h100-cr", "l0_h100", 3, 3],
         "H100_PCIe-CPP-1": ["h100-cr", "l0_h100", 1, 1],
+        "H100_PCIe-TensorRT-1": ["h100-cr", "l0_h100", 1, 2],
+        "H100_PCIe-TensorRT-2": ["h100-cr", "l0_h100", 2, 2],
         "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 2],
         "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 2],
+        "B200_PCIe-TensorRT-1": ["b100-ts2", "l0_b200", 1, 2],
+        "B200_PCIe-TensorRT-2": ["b100-ts2", "l0_b200", 2, 2],
         "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1],
+        "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
+        "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
         // Currently post-merge test stages only run tests with "stage: post_merge" mako
         // in the test-db. This behavior may change in the future.
-        "A10-TensorRT-[Post-Merge]-1": ["a10", "l0_a10", 1, 8],
-        "A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 8],
-        "A10-TensorRT-[Post-Merge]-3": ["a10", "l0_a10", 3, 8],
-        "A10-TensorRT-[Post-Merge]-4": ["a10", "l0_a10", 4, 8],
-        "A10-TensorRT-[Post-Merge]-5": ["a10", "l0_a10", 5, 8],
-        "A10-TensorRT-[Post-Merge]-6": ["a10", "l0_a10", 6, 8],
-        "A10-TensorRT-[Post-Merge]-7": ["a10", "l0_a10", 7, 8],
-        "A10-TensorRT-[Post-Merge]-8": ["a10", "l0_a10", 8, 8],
+        "A10-TensorRT-[Post-Merge]-1": ["a10", "l0_a10", 1, 2],
+        "A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 2],
         "A30-TensorRT-[Post-Merge]-1": ["a30", "l0_a30", 1, 6],
         "A30-TensorRT-[Post-Merge]-2": ["a30", "l0_a30", 2, 6],
         "A30-TensorRT-[Post-Merge]-3": ["a30", "l0_a30", 3, 6],
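The two comment lines in the hunk above explain the [Post-Merge] naming: post-merge stages run only the test-db entries whose stage mako term is post_merge, while the new unsuffixed TensorRT stages pick up the pre_merge entries. A hedged Groovy sketch of that mapping (the helper name is an assumption; only the naming convention and the two term values come from this diff):

    // Hypothetical: derive the "stage" mako term from a stage name.
    def makoStageTerm(String stageName) {
        return stageName.contains('[Post-Merge]') ? 'post_merge' : 'pre_merge'
    }

    assert makoStageTerm('A10-TensorRT-[Post-Merge]-1') == 'post_merge'
    assert makoStageTerm('A10-TensorRT-1') == 'pre_merge'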
@@ -1575,18 +1581,12 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "L40S-TensorRT-[Post-Merge]-5": ["l40s", "l0_l40s", 5, 5],
         "H100_PCIe-PyTorch-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
-        "H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-3": ["h100-cr", "l0_h100", 3, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-4": ["h100-cr", "l0_h100", 4, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-5": ["h100-cr", "l0_h100", 5, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-6": ["h100-cr", "l0_h100", 6, 7],
-        "H100_PCIe-TensorRT-[Post-Merge]-7": ["h100-cr", "l0_h100", 7, 7],
+        "H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-3": ["h100-cr", "l0_h100", 3, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-4": ["h100-cr", "l0_h100", 4, 5],
+        "H100_PCIe-TensorRT-[Post-Merge]-5": ["h100-cr", "l0_h100", 5, 5],
         "B200_PCIe-Triton-Python-[Post-Merge]-1": ["b100-ts2", "l0_b200", 1, 1],
-        "B200_PCIe-[Post-Merge]-TensorRT-1": ["b100-ts2", "l0_b200", 1, 2],
-        "B200_PCIe-[Post-Merge]-TensorRT-2": ["b100-ts2", "l0_b200", 2, 2],
-        "RTX5080-[Post-Merge]-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2],
-        "RTX5080-[Post-Merge]-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2],
         "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
         "H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1],
         "DGX_H200-8_GPUs-PyTorch-[Post-Merge]-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8],
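Taken together, these hunks rebalance the TensorRT load by moving tests from post-merge to pre-merge and shrinking the post-merge split counts (A10: 8 to 2, H100: 7 to 5). The splitId/splitCount pair shards one test-db config across parallel stages; below is a sketch of one plausible 1-based selection scheme (the real selection lives in the test-db tooling, so treat the round-robin rule as an assumption, not the actual implementation):

    // Hypothetical round-robin sharding of an ordered test list.
    def shard(List tests, int splitId, int splitCount) {
        def picked = []
        tests.eachWithIndex { t, i ->
            if (i % splitCount == splitId - 1) { picked << t }
        }
        return picked
    }

    def tests = (1..10).collect { 'test_' + it }
    assert shard(tests, 1, 5) == ['test_1', 'test_6']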
@@ -51,7 +51,7 @@ l0_a10:
       - '*a10*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -64,7 +64,7 @@ l0_b200:
       - '*b100*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -10,7 +10,7 @@ l0_gb203:
       - '*gb203*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -120,7 +120,7 @@ l0_h100:
       - '*h100*'
       linux_distribution_name: ubuntu*
     terms:
-      stage: post_merge
+      stage: pre_merge
       backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -129,52 +129,30 @@ l0_h100:
   - unittest/trt/quantization/test_weight_only_quant_matmul.py
   - unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py
   - test_e2e.py::test_trtllm_bench_sanity[-extra_config-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_sanity[-extra_config-non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_sanity[--streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_sanity[--non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_latency_sanity[FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-enable_request_rate] # negative test
   - test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
-  - test_e2e.py::test_trtllm_bench_iteration_log[TRT-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B]
-  - accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive # 6 mins
-  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] # 5 mins
-  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] # 5 mins
   - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
   - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b]
   - examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct]
   - examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it]
   - examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8]
-  - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
-  - unittest/trt/model_api/test_model_level_api.py # 9 mins on H100
-  - unittest/trt/model_api/test_model_api_multi_gpu.py # 0.5 mins on H100
-  - unittest/trt/model/test_gpt_e2e.py # 3 mins / 6 mins on H100
   - unittest/trt/model/eagle # 1 mins on H100
   - unittest/test_model_runner_cpp.py
   - test_cache.py::test_cache_sanity # 1 sec
   - unittest/llmapi/test_llm_quant.py # 5.5 mins on H100
   - test_e2e.py::test_mistral_large_hidden_vocab_size
   - llmapi/test_llm_examples.py::test_llmapi_quickstart_atexit
-  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
-  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
   - unittest/trt/attention/test_gpt_attention_IFB.py
-  - unittest/trt/attention/test_gpt_attention_no_cache.py
-  - unittest/trt/model/test_mamba.py # 3 mins
-  - examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
-  - examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
   - accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_plugin
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_swiglu_plugin
-  - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin
   - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-  - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
-  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
-  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
   - examples/test_enc_dec.py::test_llm_enc_dec_mmlu[flan-t5-small-float32-tp:1-pp:1-nb:1-disable_fp8] # 4 mins
-  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
-  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
 - condition:
     ranges:
       system_gpu_count:
@@ -300,3 +278,25 @@ l0_h100:
   - test_e2e.py::test_build_time_benchmark_sanity
   - accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
   - accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype
+  - unittest/trt/model/test_mamba.py # 3 mins
+  - examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
+  - examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
+  - unittest/trt/model_api/test_model_level_api.py # 9 mins on H100
+  - unittest/trt/model_api/test_model_api_multi_gpu.py # 0.5 mins on H100
+  - accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin
+  - examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
+  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
+  - examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
+  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
+  - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
+  - test_e2e.py::test_trtllm_bench_sanity[-extra_config-non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - test_e2e.py::test_trtllm_bench_sanity[--non-streaming-FP16-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - test_e2e.py::test_trtllm_bench_iteration_log[TRT-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
+  - accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive # 6 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] # 5 mins
+  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
+  - examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1]
+  - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
+  - unittest/trt/model/test_gpt_e2e.py # 3 mins / 6 mins on H100
+  - unittest/trt/attention/test_gpt_attention_no_cache.py