mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
Merge a8813edb44 into 6df2c8a074
This commit is contained in:
commit
c9b01f8278
@ -3267,12 +3267,9 @@ def launchTestJobs(pipeline, testFilter)
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
|
||||
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
|
||||
// PerfSanity post-merge tests
|
||||
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
|
||||
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
|
||||
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
|
||||
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8],
|
||||
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8],
|
||||
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8],
|
||||
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8-lbd", "l0_dgx_b200_perf_sanity", 1, 3, 8],
|
||||
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8-lbd", "l0_dgx_b200_perf_sanity", 2, 3, 8],
|
||||
"DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8-lbd", "l0_dgx_b200_perf_sanity", 3, 3, 8],
|
||||
]
|
||||
fullSet += x86SlurmTestConfigs.keySet()
|
||||
|
||||
@ -3306,8 +3303,7 @@ def launchTestJobs(pipeline, testFilter)
|
||||
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
|
||||
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
|
||||
// PerfSanity pre-merge tests
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 2, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 2, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
|
||||
// PerfSanity post-merge tests
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 3, 4],
|
||||
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 3, 4],
|
||||
@ -3324,8 +3320,12 @@ def launchTestJobs(pipeline, testFilter)
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
|
||||
// PerfSanity post-merge tests
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
|
||||
// Multi-Node perf tests run one pytest per stage to avoid OOM.
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 5, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 5, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 3, 5, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-4": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 4, 5, 8, 2],
|
||||
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-5": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 5, 5, 8, 2],
|
||||
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
|
||||
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
|
||||
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
|
||||
|
||||
@ -54,7 +54,9 @@ MODEL_PATH_DICT = {
|
||||
"deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
|
||||
"deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
|
||||
"deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/",
|
||||
"deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2",
|
||||
"gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
|
||||
"k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4",
|
||||
}
|
||||
|
||||
SUPPORTED_GPU_TYPE = [
|
||||
@ -852,19 +854,24 @@ class PerfSanityTestConfig:
|
||||
|
||||
def get_gpu_type() -> str:
|
||||
try:
|
||||
# GB200 maps to dgx_b200 because GB200 results were wrongly added to OpenSearch as dgx_b200 in the past.
|
||||
mapping = {
|
||||
"GB200": "dgx_b200",
|
||||
"GB300": "gb300",
|
||||
"B200": "b200",
|
||||
"B300": "b300",
|
||||
}
|
||||
output = subprocess.check_output(
|
||||
["nvidia-smi", "-L"], stderr=subprocess.DEVNULL, text=True
|
||||
"nvidia-smi -q | grep 'Product Name' | head -1",
|
||||
shell=True,
|
||||
stderr=subprocess.DEVNULL,
|
||||
text=True,
|
||||
)
|
||||
first_line = output.strip().split("\n")[0]
|
||||
gpu_models = SUPPORTED_GPU_TYPE
|
||||
for model in gpu_models:
|
||||
if model in first_line:
|
||||
if model.startswith("B") and not model.startswith("GB"):
|
||||
return f"dgx_{model.lower()}"
|
||||
return model.lower()
|
||||
model = output.split()[-1]
|
||||
return mapping.get(model, "unsupported")
|
||||
except (subprocess.CalledProcessError, FileNotFoundError, IndexError):
|
||||
print_error("Failed to get GPU type")
|
||||
return ""
|
||||
return "unsupported"
|
||||
|
||||
assert len(labels) > 1, "perf_sanity test must have a config file!"
|
||||
is_disagg = "disagg" in labels[0]
|
||||
|
||||
@ -15,25 +15,21 @@ l0_dgx_b200_perf_sanity:
|
||||
backend: pytorch
|
||||
orchestrator: mpi
|
||||
tests:
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
|
||||
|
||||
- condition:
|
||||
ranges:
|
||||
system_gpu_count:
|
||||
gte: 4
|
||||
lte: 4
|
||||
wildcards:
|
||||
gpu:
|
||||
- '*b200*'
|
||||
linux_distribution_name: ubuntu*
|
||||
cpu: x86_64
|
||||
terms:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
orchestrator: mpi
|
||||
tests:
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
|
||||
# deepseek_r1_fp8_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_8k1k] TIMEOUT (60)
|
||||
# deepseek_r1_fp4_v2_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep8_mtp1_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep8_mtp1_8k1k] TIMEOUT (60)
|
||||
# deepseek_v32_fp4_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_tep8_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_blackwell-v32_fp4_dep8_mtp1_8k1k] TIMEOUT (60)
|
||||
# k2_thinking_fp4_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_blackwell-k2_thinking_fp4_tep8_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_blackwell-k2_thinking_fp4_dep8_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_blackwell-k2_thinking_fp4_tep8_32k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_blackwell-k2_thinking_fp4_dep8_32k8k] TIMEOUT (90)
|
||||
|
||||
@ -14,12 +14,14 @@ l0_gb200_multi_gpus_perf_sanity:
|
||||
stage: pre_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
# deepseek_r1_fp4_v2_grace_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
|
||||
|
||||
- condition:
|
||||
ranges:
|
||||
system_gpu_count:
|
||||
@ -34,11 +36,22 @@ l0_gb200_multi_gpus_perf_sanity:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
|
||||
# deepseek_r1_fp4_v2_grace_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (6060)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)
|
||||
# deepseek_v32_fp4_grace_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_dep4_mtp1_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_8k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_dep4_mtp1_8k1k] TIMEOUT (60)
|
||||
# k2_thinking_fp4_grace_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_grace_blackwell-k2_thinking_fp4_tep4_8k1k] TIMEOUT (90)
|
||||
# k2_thinking_fp4_grace_blackwell - DEP4 with CUTLASS (hang)
|
||||
# - perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_grace_blackwell-k2_thinking_fp4_dep4_8k1k] TIMEOUT (90)
|
||||
# gpt_oss_120b_fp4_grace_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (60)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (60)
|
||||
|
||||
@ -13,5 +13,10 @@ l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1] TIMEOUT (90)
|
||||
# deepseek_r1_fp4_v2_2_nodes_grace_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1_1k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1_8k1k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] TIMEOUT (90)
|
||||
# k2_thinking_fp4_2_nodes_grace_blackwell
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_tep8_32k8k] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_dep8_32k8k] TIMEOUT (90)
|
||||
|
||||
@ -2,18 +2,18 @@ metadata:
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
supported_gpus:
|
||||
- GB200
|
||||
- GB300
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
server_configs:
|
||||
- name: "r1_fp4_v2_dep8_mtp1"
|
||||
# 1k1k configs - DEP8 with CUTLASS, MTP1
|
||||
- name: "r1_fp4_v2_dep8_mtp1_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
trust_remote_code: true
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 3136
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
@ -28,16 +28,56 @@ server_configs:
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.5
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con32_iter12_1k1k"
|
||||
concurrency: 32
|
||||
iterations: 12
|
||||
- name: "con1024_iter10_1k1k"
|
||||
concurrency: 1024
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - DEP8 with CUTLASS, MTP1
|
||||
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
trust_remote_code: true
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con1024_iter10_8k1k"
|
||||
concurrency: 1024
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 1k1k configs - TEP8 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tep8_mtp3"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
trust_remote_code: true
|
||||
|
||||
@ -2,72 +2,8 @@ metadata:
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
supported_gpus:
|
||||
- B200
|
||||
- B300
|
||||
server_configs:
|
||||
- name: "r1_fp4_v2_dep4_mtp1_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con2048_iter5_1k1k"
|
||||
concurrency: 2048
|
||||
iterations: 5
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
backend: "openai"
|
||||
|
||||
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 32
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con32_iter10_1k1k"
|
||||
concurrency: 32
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
backend: "openai"
|
||||
|
||||
# 1k1k configs - TP4 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -90,10 +26,111 @@ server_configs:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con4_iter10_1k1k"
|
||||
concurrency: 4
|
||||
- name: "con2_iter10_1k1k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 1k1k configs - DEP8 with CUTLASS, MTP1
|
||||
- name: "r1_fp4_v2_dep8_mtp1_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con1024_iter10_1k1k"
|
||||
concurrency: 1024
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - TP4 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 1
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 4
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con4_iter10_8k1k"
|
||||
concurrency: 4
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - DEP8 with CUTLASS, MTP1
|
||||
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con256_iter10_8k1k"
|
||||
concurrency: 256
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
@ -4,7 +4,7 @@ metadata:
|
||||
- GB200
|
||||
- GB300
|
||||
server_configs:
|
||||
# 1k1k configs
|
||||
# 1k1k configs - DEP4 with CUTLASS, MTP1
|
||||
- name: "r1_fp4_v2_dep4_mtp1_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -38,7 +38,15 @@ server_configs:
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
backend: "openai"
|
||||
- name: "con1024_iter10_1k1k"
|
||||
concurrency: 1024
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 1k1k configs - TEP4 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -69,6 +77,7 @@ server_configs:
|
||||
random_range_ratio: 0.8
|
||||
backend: "openai"
|
||||
|
||||
# 1k1k configs - TP4 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -98,8 +107,15 @@ server_configs:
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
backend: "openai"
|
||||
- name: "con2_iter10_1k1k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs
|
||||
# 8k1k configs - DEP4 with CUTLASS, MTP1
|
||||
- name: "r1_fp4_v2_dep4_mtp1_8k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -133,7 +149,15 @@ server_configs:
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
- name: "con256_iter10_8k1k"
|
||||
concurrency: 256
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - TEP4 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tep4_mtp3_8k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -164,6 +188,7 @@ server_configs:
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - TP4 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -193,8 +218,15 @@ server_configs:
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
- name: "con2_iter10_8k1k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 1k8k configs
|
||||
# 1k8k configs - DEP4 with CUTLASS, MTP1
|
||||
- name: "r1_fp4_v2_dep4_mtp1_1k8k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -229,6 +261,7 @@ server_configs:
|
||||
random_range_ratio: 0.8
|
||||
backend: "openai"
|
||||
|
||||
# 1k8k configs - TEP4 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tep4_mtp3_1k8k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
@ -259,6 +292,7 @@ server_configs:
|
||||
random_range_ratio: 0.8
|
||||
backend: "openai"
|
||||
|
||||
# 1k8k configs - TP4 with TRTLLM, MTP3
|
||||
- name: "r1_fp4_v2_tp4_mtp3_1k8k"
|
||||
model_name: "deepseek_r1_0528_fp4_v2"
|
||||
tensor_parallel_size: 4
|
||||
|
||||
@ -2,8 +2,39 @@ metadata:
|
||||
model_name: deepseek_r1_0528_fp8
|
||||
supported_gpus:
|
||||
- B200
|
||||
- B300
|
||||
server_configs:
|
||||
# 1k1k configs - TP8 with TRTLLM, MTP3
|
||||
- name: "r1_fp8_tp8_mtp3_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp8"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 1
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 8
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con4_iter10_1k1k"
|
||||
concurrency: 4
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 1k1k configs - DEP8 with DEEPGEMM, MTP1
|
||||
- name: "r1_fp8_dep8_mtp1_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp8"
|
||||
tensor_parallel_size: 8
|
||||
@ -30,51 +61,22 @@ server_configs:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con4096_iter5_1k1k"
|
||||
concurrency: 4096
|
||||
iterations: 5
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
backend: "openai"
|
||||
|
||||
- name: "r1_fp8_tep8_mtp3_1k1k"
|
||||
model_name: "deepseek_r1_0528_fp8"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 64
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'DEEPGEMM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 64
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con64_iter10_1k1k"
|
||||
concurrency: 64
|
||||
- name: "con1024_iter10_1k1k"
|
||||
concurrency: 1024
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
- name: "r1_fp8_tp8_mtp3_1k1k"
|
||||
# 8k1k configs - TP8 with TRTLLM, MTP3
|
||||
- name: "r1_fp8_tp8_mtp3_8k1k"
|
||||
model_name: "deepseek_r1_0528_fp8"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 1
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 8192
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
@ -90,10 +92,45 @@ server_configs:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con8_iter10_1k1k"
|
||||
concurrency: 8
|
||||
- name: "con4_iter10_8k1k"
|
||||
concurrency: 4
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.8
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - DEP8 with DEEPGEMM, MTP1
|
||||
- name: "r1_fp8_dep8_mtp1_8k1k"
|
||||
model_name: "deepseek_r1_0528_fp8"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'DEEPGEMM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con256_iter10_8k1k"
|
||||
concurrency: 256
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
70
tests/scripts/perf-sanity/deepseek_v32_fp4_blackwell.yaml
Normal file
70
tests/scripts/perf-sanity/deepseek_v32_fp4_blackwell.yaml
Normal file
@ -0,0 +1,70 @@
|
||||
metadata:
|
||||
model_name: deepseek_v32_fp4
|
||||
supported_gpus:
|
||||
- B200
|
||||
server_configs:
|
||||
# 8k1k configs - TEP8 with TRTLLM, MTP3
|
||||
- name: "v32_fp4_tep8_mtp3_8k1k"
|
||||
model_name: "deepseek_v32_fp4"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 32
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con2_iter10_8k1k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - DEP8 with CUTLASS, MTP1
|
||||
- name: "v32_fp4_dep8_mtp1_8k1k"
|
||||
model_name: "deepseek_v32_fp4"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con256_iter10_8k1k"
|
||||
concurrency: 256
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
136
tests/scripts/perf-sanity/deepseek_v32_fp4_grace_blackwell.yaml
Normal file
136
tests/scripts/perf-sanity/deepseek_v32_fp4_grace_blackwell.yaml
Normal file
@ -0,0 +1,136 @@
|
||||
metadata:
|
||||
model_name: deepseek_v32_fp4
|
||||
supported_gpus:
|
||||
- GB200
|
||||
server_configs:
|
||||
# 1k1k configs - TEP4 with TRTLLM, MTP3
|
||||
- name: "v32_fp4_tep4_mtp3_1k1k"
|
||||
model_name: "deepseek_v32_fp4"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 32
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con2_iter10_1k1k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 1k1k configs - DEP4 with CUTLASS, MTP1
|
||||
- name: "v32_fp4_dep4_mtp1_1k1k"
|
||||
model_name: "deepseek_v32_fp4"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con1024_iter10_1k1k"
|
||||
concurrency: 1024
|
||||
iterations: 10
|
||||
isl: 1024
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - TEP4 with TRTLLM, MTP3
|
||||
- name: "v32_fp4_tep4_mtp3_8k1k"
|
||||
model_name: "deepseek_v32_fp4"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 32
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 3
|
||||
client_configs:
|
||||
- name: "con2_iter10_8k1k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - DEP4 with CUTLASS, MTP1
|
||||
- name: "v32_fp4_dep4_mtp1_8k1k"
|
||||
model_name: "deepseek_v32_fp4"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
speculative_config:
|
||||
decoding_type: 'MTP'
|
||||
num_nextn_predict_layers: 1
|
||||
client_configs:
|
||||
- name: "con256_iter10_8k1k"
|
||||
concurrency: 256
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
@ -0,0 +1,68 @@
|
||||
metadata:
|
||||
model_name: k2_thinking_fp4
|
||||
supported_gpus:
|
||||
- GB200
|
||||
hardware:
|
||||
gpus_per_node: 4
|
||||
server_configs:
|
||||
# 32k8k configs - TEP8 with TRTLLM
|
||||
- name: "k2_thinking_fp4_tep8_32k8k"
|
||||
model_name: "k2_thinking_fp4"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_chunked_prefill: true
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 32
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
client_configs:
|
||||
- name: "con2_iter10_32k8k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 32768
|
||||
osl: 8192
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 32k8k configs - DEP8 with CUTLASS
|
||||
- name: "k2_thinking_fp4_dep8_32k8k"
|
||||
model_name: "k2_thinking_fp4"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 256
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_chunked_prefill: true
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 256
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
client_configs:
|
||||
- name: "con128_iter10_32k8k"
|
||||
concurrency: 128
|
||||
iterations: 10
|
||||
isl: 32768
|
||||
osl: 8192
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
126
tests/scripts/perf-sanity/k2_thinking_fp4_blackwell.yaml
Normal file
126
tests/scripts/perf-sanity/k2_thinking_fp4_blackwell.yaml
Normal file
@ -0,0 +1,126 @@
|
||||
metadata:
|
||||
model_name: k2_thinking_fp4
|
||||
supported_gpus:
|
||||
- B200
|
||||
server_configs:
|
||||
# 8k1k configs - TEP8 with TRTLLM
|
||||
- name: "k2_thinking_fp4_tep8_8k1k"
|
||||
model_name: "k2_thinking_fp4"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 32
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
client_configs:
|
||||
- name: "con2_iter10_8k1k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - DEP8 with CUTLASS
|
||||
- name: "k2_thinking_fp4_dep8_8k1k"
|
||||
model_name: "k2_thinking_fp4"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
client_configs:
|
||||
- name: "con512_iter10_8k1k"
|
||||
concurrency: 512
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 32k8k configs - TEP8 with TRTLLM
|
||||
- name: "k2_thinking_fp4_tep8_32k8k"
|
||||
model_name: "k2_thinking_fp4"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_chunked_prefill: true
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 32
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
client_configs:
|
||||
- name: "con2_iter10_32k8k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 32768
|
||||
osl: 8192
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 32k8k configs - DEP8 with CUTLASS
|
||||
- name: "k2_thinking_fp4_dep8_32k8k"
|
||||
model_name: "k2_thinking_fp4"
|
||||
tensor_parallel_size: 8
|
||||
moe_expert_parallel_size: 8
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 256
|
||||
max_num_tokens: 8192
|
||||
attn_backend: "TRTLLM"
|
||||
enable_chunked_prefill: true
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 256
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
client_configs:
|
||||
- name: "con128_iter10_32k8k"
|
||||
concurrency: 128
|
||||
iterations: 10
|
||||
isl: 32768
|
||||
osl: 8192
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
@ -0,0 +1,64 @@
|
||||
metadata:
|
||||
model_name: k2_thinking_fp4
|
||||
supported_gpus:
|
||||
- GB200
|
||||
server_configs:
|
||||
# 8k1k configs - TEP4 with TRTLLM
|
||||
- name: "k2_thinking_fp4_tep4_8k1k"
|
||||
model_name: "k2_thinking_fp4"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: false
|
||||
moe_config:
|
||||
backend: 'TRTLLM'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 32
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
client_configs:
|
||||
- name: "con2_iter10_8k1k"
|
||||
concurrency: 2
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
|
||||
# 8k1k configs - DEP4 with CUTLASS (NOTE: marked "hang" - presumably this config is known to hang; confirm and reference the tracking issue)
|
||||
- name: "k2_thinking_fp4_dep4_8k1k"
|
||||
model_name: "k2_thinking_fp4"
|
||||
tensor_parallel_size: 4
|
||||
moe_expert_parallel_size: 4
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 512
|
||||
max_num_tokens: 12288
|
||||
attn_backend: "TRTLLM"
|
||||
enable_attention_dp: true
|
||||
attention_dp_config:
|
||||
batching_wait_iters: 0
|
||||
enable_balance: true
|
||||
timeout_iters: 60
|
||||
moe_config:
|
||||
backend: 'CUTLASS'
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
max_batch_size: 512
|
||||
kv_cache_config:
|
||||
dtype: 'fp8'
|
||||
enable_block_reuse: false
|
||||
free_gpu_memory_fraction: 0.8
|
||||
client_configs:
|
||||
- name: "con256_iter10_8k1k"
|
||||
concurrency: 256
|
||||
iterations: 10
|
||||
isl: 8192
|
||||
osl: 1024
|
||||
random_range_ratio: 0.2
|
||||
backend: "openai"
|
||||
Loading…
Reference in New Issue
Block a user