mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[None][test] Remove most TRT-backend test cases in llm_perf_nim.yml (#10572)
Signed-off-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com>
Signed-off-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Co-authored-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com>
parent c5914f9085
commit 8e806abac3
@@ -9,7 +9,6 @@ llm_perf_sanity:
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
# 6: H20, H100, H200, B200, B300, RTX6000-Server
# 7: H20, H100, H200, B200, B300
# ===============================================================================

# 1: All GPUs
@@ -31,6 +30,7 @@ llm_perf_sanity:
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:500,2000]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:512,32]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]
- perf/test_perf.py::test_perf[starcoder2_7b-bench-pytorch-bfloat16-input_output_len:512,512]
# Phi-4-multimodal-instruct
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
# Bielik-11B-v2.2-Instruct
@@ -124,25 +124,9 @@ llm_perf_sanity:
# for chunked prefill cases
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200]
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8] TIMEOUT(100)
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(100)
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
# disagg server cases
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
- perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
# gpt_oss_20b_fp4
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]

# 7: H20, H100, H200, B200, B300
- condition:
    ranges:
      system_gpu_count:
        gte: 8
      compute_capability:
        gte: 9.0
        lt: 12.0
  tests:
  # chunked attention case
  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]
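For readers skimming the diff, the gated group at the end of the hunk above follows the schema these perf test list YAMLs use throughout: a condition with numeric ranges selects which machines may run the group, and tests lists the pytest IDs to execute. Below is a minimal sketch, assuming the field names and bounds shown in the hunk; the annotated test entry and my reading of its embedded parameters (dtype, input/output lengths, tensor parallelism) are illustrative assumptions, not part of this commit.

llm_perf_sanity:
# 7: H20, H100, H200, B200, B300
- condition:
    ranges:
      system_gpu_count:
        gte: 8            # only schedule on nodes with at least 8 GPUs
      compute_capability:
        gte: 9.0          # SM 9.0 (Hopper) and newer...
        lt: 12.0          # ...but below SM 12.0
  tests:
  # assumed reading of the test ID: FP8 weights, 128 input / 128 output tokens,
  # tensor parallel 8 across 8 GPUs
  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]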