mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
tests: cherry-pick from main branch, add qwen3 test cases and amend test name in perf test (#5357)
Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com>
This commit is contained in:
parent
decfe2fdb3
commit
e87cf62c12
@ -101,6 +101,26 @@ def get_model_yaml_config(model_label: str,
|
||||
'enable_attention_dp': False,
|
||||
}
|
||||
},
|
||||
# Qwen3 models with fp4 quantization on B200 and fp8 quantization on H200/H20
|
||||
{
|
||||
'patterns': [
|
||||
'qwen3_235b_a22b_fp4-bench-pytorch-float4-maxbs:512-maxnt:2048-input_output_len:1000,2000-con:512-ep:4-gpus:4',
|
||||
'qwen3_235b_a22b_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:1000,2000-con:256-ep:8-gpus:8'
|
||||
],
|
||||
'config': {
|
||||
'enable_attention_dp': True,
|
||||
}
|
||||
},
|
||||
# Qwen3 models with fp4 quantization on B200, using the TRTLLM MoE backend
|
||||
{
|
||||
'patterns': [
|
||||
'qwen3_235b_a22b_fp4-bench-pytorch-float4-maxbs:512-maxnt:2048-input_output_len:1000,2000-con:8-ep:8-gpus:8',
|
||||
],
|
||||
'config': {
|
||||
'enable_attention_dp': False,
|
||||
'moe_backend': 'TRTLLM'
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
# Apply pattern-based configurations on top of base config
|
||||
|
||||
@ -84,12 +84,8 @@ MODEL_PATH_DICT = {
|
||||
"deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
|
||||
"qwen2_7b_instruct": "Qwen2-7B-Instruct",
|
||||
"qwen_14b_chat": "Qwen-14B-Chat",
|
||||
"qwen3_8b": "Qwen3-8B",
|
||||
"qwen3_8b_fp8": "Qwen3-8B-FP8",
|
||||
"qwen3_30b_a3b": "Qwen3-30B-A3B",
|
||||
"qwen3_30b_a3b_fp8": "Qwen3-30B-A3B-FP8",
|
||||
"qwen3_235b_a22b": "Qwen3-235B-A22B",
|
||||
"qwen3_235b_a22b_fp8": "Qwen3-235B-A22B-FP8",
|
||||
"qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
|
||||
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
|
||||
"starcoder2_3b": "starcoder2-3b",
|
||||
"starcoder_15b": "starcoder2-15b",
|
||||
"t5": "t5-small", # not supported for trtllm-bench build config
|
||||
|
||||
@ -5,14 +5,14 @@ trt_llm_release_perf_cluster_test:
|
||||
system_gpu_count:
|
||||
gte: 1
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[llama_v3_8b_instruct-bench-bfloat16-input_output_len:128,128]
|
||||
- perf/test_perf.py::test_perf[llama_v3_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8]
|
||||
- perf/test_perf.py::test_perf[llama_v3_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
|
||||
- perf/test_perf.py::test_perf[llama_v3_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500]
|
||||
- perf/test_perf.py::test_perf[llama_v3_8b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500]
|
||||
- perf/test_perf.py::test_perf[llama_v3_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000]
|
||||
- perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20]
|
||||
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8]
|
||||
- perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8]
|
||||
@ -42,6 +42,9 @@ trt_llm_release_perf_cluster_test:
|
||||
- perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:128,128-gpus:4]
|
||||
- perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:512,32-gpus:4]
|
||||
- perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4]
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4]
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4]
|
||||
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4]
|
||||
#- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4]
|
||||
|
||||
# Tests for systems with 8+ GPUs
|
||||
@ -69,7 +72,8 @@ trt_llm_release_perf_cluster_test:
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40)
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40)
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-streaming-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40)
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8]
|
||||
|
||||
@ -99,7 +99,7 @@ trt_llm_release_perf_sanity_test:
|
||||
- perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
|
||||
- perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-gpu:2]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128-gpus:2]
|
||||
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
|
||||
- perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2]
|
||||
|
||||
@ -376,7 +376,7 @@ trt_llm_release_perf_test:
|
||||
system_gpu_count:
|
||||
gte: 8
|
||||
gpu_memory:
|
||||
gt: 100000
|
||||
gt: 80000
|
||||
wildcards:
|
||||
gpu:
|
||||
- '*h200*'
|
||||
@ -390,6 +390,7 @@ trt_llm_release_perf_test:
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT (45)
|
||||
|
||||
# FP8 specific tests
|
||||
- condition:
|
||||
|
||||
@ -395,6 +395,10 @@ perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-exe-plugin_ifb-float16-inp
|
||||
perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
|
||||
perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
|
||||
perf/test_perf.py::test_perf[starcoder2_3b-bench-pytorch-float16-input_output_len:512,200] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
|
||||
perf/test_perf.py::test_perf[bart_large_cnn-bench-float16-input_output_len:128,20] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
|
||||
perf/test_perf.py::test_perf[mamba_130m-bench-float16-input_output_len:128,128] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
|
||||
perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
|
||||
perf/test_perf.py::test_perf[roberta_base-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
|
||||
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False] SKIP (https://nvbugs/5303573)
|
||||
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5303573)
|
||||
test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5236980)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user