mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[TRTLLM-9932][test] add kimi_k2 single node perf test (#10436)
Signed-off-by: Ruodi Lu <ruodil@users.noreply.github.com>
Co-authored-by: Ruodi Lu <ruodil@users.noreply.github.com>
This commit is contained in:
parent
4632a8642d
commit
2b72d33fdc
@ -59,7 +59,7 @@ def get_model_yaml_config(model_label: str,
|
|||||||
pattern_configs = [
|
pattern_configs = [
|
||||||
# Deepseek default cases
|
# Deepseek default cases
|
||||||
{
|
{
|
||||||
'patterns': 'deepseek_r1',
|
'patterns': ['deepseek_r1', 'kimi_k2_nvfp4'],
|
||||||
'config': {
|
'config': {
|
||||||
'enable_attention_dp': True,
|
'enable_attention_dp': True,
|
||||||
}
|
}
|
||||||
|
|||||||
@ -144,6 +144,7 @@ MODEL_PATH_DICT = {
|
|||||||
"gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b",
|
"gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b",
|
||||||
"nemotron_nano_9b_v2": "NVIDIA-Nemotron-Nano-12B-v2",
|
"nemotron_nano_9b_v2": "NVIDIA-Nemotron-Nano-12B-v2",
|
||||||
"starcoder2_7b": "starcoder2-7b",
|
"starcoder2_7b": "starcoder2-7b",
|
||||||
|
"kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4",
|
||||||
}
|
}
|
||||||
# Model PATH of HuggingFace
|
# Model PATH of HuggingFace
|
||||||
HF_MODEL_PATH = {
|
HF_MODEL_PATH = {
|
||||||
|
|||||||
@ -14,10 +14,11 @@ llm_perf_core:
|
|||||||
# 9: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
|
# 9: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
|
||||||
# 10: GB200, B200, B300, GB300, RTX6000-Server test cases
|
# 10: GB200, B200, B300, GB300, RTX6000-Server test cases
|
||||||
# 11: B200, GB200, B300, GB300 test cases
|
# 11: B200, GB200, B300, GB300 test cases
|
||||||
# 12: H100, H20, H200, B200, B300 test cases
|
# 12: B200, B300 test cases
|
||||||
# 13: H100, H20, H200, B200, B300, RTX-6000 Server test cases
|
# 13: H100, H20, H200, B200, B300 test cases
|
||||||
# 14: RTX-6000D, RTX-6000 Server test cases
|
# 14: H100, H20, H200, B200, B300, RTX-6000 Server test cases
|
||||||
# 15: RTX6000-Server test cases
|
# 15: RTX-6000D, RTX-6000 Server test cases
|
||||||
|
# 16: RTX6000-Server test cases
|
||||||
# ===============================================================================
|
# ===============================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -289,7 +290,21 @@ llm_perf_core:
|
|||||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120)
|
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120)
|
||||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:32-maxnt:32768-input_output_len:8192,1024-reqs:20-con:1-ep:1-tp:4-gpus:4] TIMEOUT(120)
|
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:32-maxnt:32768-input_output_len:8192,1024-reqs:20-con:1-ep:1-tp:4-gpus:4] TIMEOUT(120)
|
||||||
|
|
||||||
# 12: H100, H20, H200, B200, B300 test cases
|
# 12: B200, B300 test cases
|
||||||
|
- condition:
|
||||||
|
ranges:
|
||||||
|
system_gpu_count:
|
||||||
|
gte: 8
|
||||||
|
compute_capability:
|
||||||
|
gte: 10.0
|
||||||
|
lte: 10.3
|
||||||
|
tests:
|
||||||
|
- perf/test_perf.py::test_perf[kimi_k2_nvfp4-bench-pytorch-float4-maxbs:16-input_output_len:128,128-reqs:20-con:1-ep:8-tp:8-gpus:8]
|
||||||
|
- perf/test_perf.py::test_perf[kimi_k2_nvfp4-bench-pytorch-float4-maxbs:256-input_output_len:2000,500-ep:8-tp:8-gpus:8]
|
||||||
|
- perf/test_perf.py::test_perf[kimi_k2_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.6-input_output_len:1000,1000-ep:8-tp:8-gpus:8]
|
||||||
|
- perf/test_perf.py::test_perf[kimi_k2_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.6-input_output_len:5000,500-reqs:2000-ep:8-tp:8-gpus:8] TIMEOUT(120)
|
||||||
|
|
||||||
|
# 13: H100, H20, H200, B200, B300 test cases
|
||||||
- condition:
|
- condition:
|
||||||
ranges:
|
ranges:
|
||||||
system_gpu_count:
|
system_gpu_count:
|
||||||
@ -356,7 +371,7 @@ llm_perf_core:
|
|||||||
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
|
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
|
||||||
|
|
||||||
|
|
||||||
# 13: H100, H20, H200, B200, B300, RTX-6000 Server test cases
|
# 14: H100, H20, H200, B200, B300, RTX-6000 Server test cases
|
||||||
- condition:
|
- condition:
|
||||||
ranges:
|
ranges:
|
||||||
system_gpu_count:
|
system_gpu_count:
|
||||||
@ -368,7 +383,7 @@ llm_perf_core:
|
|||||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||||
|
|
||||||
|
|
||||||
# 14: RTX-6000D, RTX-6000 Server test cases
|
# 15: RTX-6000D, RTX-6000 Server test cases
|
||||||
- condition:
|
- condition:
|
||||||
ranges:
|
ranges:
|
||||||
system_gpu_count:
|
system_gpu_count:
|
||||||
@ -402,7 +417,7 @@ llm_perf_core:
|
|||||||
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8-tp:2-gpus:2]
|
- perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8-tp:2-gpus:2]
|
||||||
|
|
||||||
|
|
||||||
# 15: RTX6000-Server test cases
|
# 16: RTX6000-Server test cases
|
||||||
- condition:
|
- condition:
|
||||||
ranges:
|
ranges:
|
||||||
system_gpu_count:
|
system_gpu_count:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user