[TRTLLM-8263][feat] Add Disagg Perf Tests (#10912)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
This commit is contained in:
chenfeiz0326 2026-02-04 10:16:11 +08:00 committed by GitHub
parent 588db0ed64
commit 04b7db3ab5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
65 changed files with 3779 additions and 227 deletions

View File

@ -918,7 +918,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def disaggMode = stageName.contains("PerfSanity-Disagg")
def disaggMode = stageName.contains("Disagg-PerfSanity")
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@ -3151,6 +3151,15 @@ def runInKubernetes(pipeline, podSpec, containerName)
}
}
def buildStageConfigs(stageName, platform, testlist, testCount, gpuCount, nodeCount, runWithSbatch=false) {
def configs = [:]
for (int k = 1; k <= testCount; k++) {
def key = "${stageName}-${k}"
configs[key] = [platform, testlist, k, testCount, gpuCount, nodeCount, runWithSbatch]
}
return configs
}
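// For illustration, a call such as
//   buildStageConfigs("GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge",
//                     "auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8", 5, 8, 2)
// produces the keys "...-Post-Merge-1" through "...-Post-Merge-5", each mapped to
// ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8", k, 5, 8, 2],
// i.e. one stage entry per test-list split.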
def launchTestJobs(pipeline, testFilter)
{
// IMPORTANT: Stage Configuration Syntax Requirement
@ -3354,18 +3363,57 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["auto:gb200-flex", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// PerfSanity post-merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-3": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 3, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-4": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 4, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-5": ["auto:gb200-flex", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 5, 5, 8, 2],
// Disable stage 'GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1' due to https://nvbugs/5819053
// "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["auto:gb200-flex", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
]
// PerfSanity post-merge aggr tests
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-8_GPUs-2_Nodes-PyTorch-Aggr-PerfSanity-Node2-GPU8-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8",
5,
8,
2
)
// PerfSanity post-merge disagg tests
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE1-GPU4-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4",
1,
8,
2
)
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE1-GPU4-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4",
3,
8,
2
)
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE2-GPU8-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8",
1,
12,
3
)
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE2-GPU8-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8",
5,
12,
3
)
multiNodesSBSAConfigs += buildStageConfigs(
"GB200-16_GPUs-4_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE2-GPU8-GEN1-NODE2-GPU8-Post-Merge",
"auto:gb200-flex",
"l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8",
1,
16,
4
)
fullSet += multiNodesSBSAConfigs.keySet()
if (env.targetArch == AARCH64_TRIPLE) {
@ -3610,9 +3658,9 @@ def launchTestJobs(pipeline, testFilter)
}, {}, true)
}]}
multiGpuJobs = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && !it.key.contains("Post-Merge")}
multiGpuJobs = parallelJobs.findAll{(it.key =~ /\d+_GPUs/) && !it.key.contains("Post-Merge")}
println multiGpuJobs.keySet()
multiGpuJobsPostMerge = parallelJobs.findAll{(it.key.contains("2_GPUs") || it.key.contains("4_GPUs") || it.key.contains("8_GPUs")) && it.key.contains("Post-Merge")}
multiGpuJobsPostMerge = parallelJobs.findAll{(it.key =~ /\d+_GPUs/) && it.key.contains("Post-Merge")}
parallelJobs += docBuildJobs
parallelJobs += sanityCheckJobs
@ -3927,9 +3975,9 @@ pipeline {
def testPhase2StageName = env.testPhase2StageName
if (testPhase2StageName) {
def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
def multiGpuPattern = /\d+_GPUs/
singleGpuJobs = parallelJobs.findAll{!(it.key =~ multiGpuPattern)}
dgxJobs = parallelJobs.findAll{it.key =~ multiGpuPattern}
}
if (env.JOB_NAME ==~ /.*Single-GPU.*/) {

View File

@ -19,13 +19,13 @@ echo "Installation completed on all nodes"
# Start gen servers
echo "Starting gen servers..."
for i in $(seq 0 $((numGenServers - 1))); do
gen_world_size=$((nodesPerGenServer * gpusPerNode))
gen_world_size=$((nodesPerGenServer * gpusPerfNodePerfGenServer))
export DISAGG_SERVING_TYPE="GEN_$i"
export pytestCommand="$pytestCommandWorker"
srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
-N $nodesPerGenServer \
--ntasks=$gen_world_size \
--ntasks-per-node=$gpusPerNode \
--ntasks-per-node=$gpusPerfNodePerfGenServer \
$runScript &> $jobWorkspace/gen_server_$i.log &
echo "Started gen server $i"
done
@ -34,13 +34,13 @@ done
if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then
echo "Starting ctx servers..."
for i in $(seq 0 $((numCtxServers - 1))); do
ctx_world_size=$((nodesPerCtxServer * gpusPerNode))
ctx_world_size=$((nodesPerCtxServer * gpusPerfNodePerfCtxServer))
export DISAGG_SERVING_TYPE="CTX_$i"
export pytestCommand="$pytestCommandWorker"
srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
-N $nodesPerCtxServer \
--ntasks=$ctx_world_size \
--ntasks-per-node=$gpusPerNode \
--ntasks=$ctx_world_size \
--ntasks-per-node=$gpusPerfNodePerfCtxServer \
$runScript &> $jobWorkspace/ctx_server_$i.log &
echo "Started ctx server $i"
done

View File

@ -38,6 +38,9 @@ def get_hardware_config(config, benchmark_mode):
nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node
nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node
gpus_per_node_per_ctx_server = min(gpus_per_ctx_server, gpus_per_node)
gpus_per_node_per_gen_server = min(gpus_per_gen_server, gpus_per_node)
total_nodes = num_ctx_servers * nodes_per_ctx_server + num_gen_servers * nodes_per_gen_server
total_gpus = total_nodes * gpus_per_node
@ -49,6 +52,8 @@ def get_hardware_config(config, benchmark_mode):
"gpus_per_gen_server": gpus_per_gen_server,
"nodes_per_ctx_server": nodes_per_ctx_server,
"nodes_per_gen_server": nodes_per_gen_server,
"gpus_per_node_per_ctx_server": gpus_per_node_per_ctx_server,
"gpus_per_node_per_gen_server": gpus_per_node_per_gen_server,
"total_nodes": total_nodes,
"total_gpus": total_gpus,
}
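A quick sketch of the node arithmetic above, using the 12-GPU / 3-node disaggregated layout from the pipeline stages as a worked example (values assumed for illustration):
# Worked example of get_hardware_config's node math (illustrative values).
gpus_per_node = 4
gpus_per_ctx_server = 4
gpus_per_gen_server = 8
nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node   # ceil(4/4) = 1
nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node   # ceil(8/4) = 2
gpus_per_node_per_ctx_server = min(gpus_per_ctx_server, gpus_per_node)              # 4
gpus_per_node_per_gen_server = min(gpus_per_gen_server, gpus_per_node)              # 4
# With num_ctx_servers = num_gen_servers = 1: total_nodes = 1*1 + 1*2 = 3 and total_gpus = 3*4 = 12,
# matching the CTX1-NODE1-GPU4-GEN1-NODE2-GPU8 stage (12 GPUs, 3 nodes).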
@ -102,7 +107,14 @@ def remove_whitespace_lines(lines):
return [line.strip() for line in lines if line.strip()]
def get_pytest_command_no_llmapilaunch(script_prefix_lines):
def get_pytest_commands(script_prefix_lines):
# Derive the worker, disagg_server, and benchmark pytest commands from the original pytest command.
# The worker command keeps trtllm-llmapi-launch but drops the --csv, --cov, and --periodic flags.
# The disagg_server command drops trtllm-llmapi-launch as well as the --csv, --cov, and --periodic flags.
# The benchmark command drops trtllm-llmapi-launch but keeps the --csv, --cov, and --periodic flags.
pytest_command_line = None
for line in script_prefix_lines:
if "export pytestCommand=" in line:
@ -110,17 +122,102 @@ def get_pytest_command_no_llmapilaunch(script_prefix_lines):
break
if not pytest_command_line:
return ""
return "", "", ""
# Replace pytestCommand with pytestCommandNoLLMAPILaunch
replaced_line = pytest_command_line.replace("pytestCommand", "pytestCommandNoLLMAPILaunch")
def split_pytest_command_line(command_line):
# After pytest, there are six types of substrings:
# Type 1: --xxx=yyy (long option with value, self-contained)
# Type 2: --xxx= (long option with empty value, self-contained)
# Type 3: --xxx (long option flag, no value)
# Type 4: --xxx yyy (long option with value as next arg)
# Type 5: -x yyy (short single-letter option with value as next arg)
# Type 6: -x (short option flag, e.g., -v, -vv)
parts = command_line.split()
pytest_index = None
for idx, part in enumerate(parts):
if "pytest" == part:
pytest_index = idx
break
if pytest_index is None:
return parts
# Split by space, find and remove the substring with trtllm-llmapi-launch
replaced_line_parts = replaced_line.split()
replaced_line_parts_no_llmapi = [
part for part in replaced_line_parts if "trtllm-llmapi-launch" not in part
grouped_parts = parts[: pytest_index + 1]
i = pytest_index + 1
while i < len(parts):
part = parts[i]
has_next = i + 1 < len(parts)
next_is_value = has_next and not parts[i + 1].startswith("-")
# Type 1 & 2: --xxx=yyy or --xxx= (self-contained, has '=')
if part.startswith("--") and "=" in part:
grouped_parts.append(part)
i += 1
continue
# Type 4: --xxx yyy (long option with value as next arg)
if part.startswith("--") and next_is_value:
grouped_parts.append(f"{part} {parts[i + 1]}")
i += 2
continue
# Type 3: --xxx (long option flag)
if part.startswith("--"):
grouped_parts.append(part)
i += 1
continue
# Type 5: -x yyy (short single-letter option with value as next arg)
# Only single letter after dash, e.g., -o, not -vv
if part.startswith("-") and len(part) == 2 and next_is_value:
grouped_parts.append(f"{part} {parts[i + 1]}")
i += 2
continue
# Type 6: -x (short option flag, including combined like -vv)
if part.startswith("-"):
grouped_parts.append(part)
i += 1
continue
# Other parts (shouldn't happen after pytest, but handle gracefully)
grouped_parts.append(part)
i += 1
return grouped_parts
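# For illustration (hypothetical flags, not the real pipeline command), splitting
#   "trtllm-llmapi-launch pytest -v --csv report.csv --cov=tensorrt_llm --timeout 3600 tests/perf"
# groups the arguments after "pytest" as
#   ["trtllm-llmapi-launch", "pytest", "-v", "--csv report.csv", "--cov=tensorrt_llm",
#    "--timeout 3600", "tests/perf"],
# so an option and its separate value stay together and can be kept or dropped as one unit.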
def is_llmapi_launch(part):
return "trtllm-llmapi-launch" in part
def is_output_file_part(part):
return any(flag in part for flag in ("--csv", "--cov", "--periodic"))
worker_line = pytest_command_line.replace("pytestCommand", "partialPytestCommandWorker")
worker_parts = [
part for part in split_pytest_command_line(worker_line) if not is_output_file_part(part)
]
return " ".join(replaced_line_parts_no_llmapi)
worker_pytest_command = " ".join(worker_parts)
disagg_server_line = pytest_command_line.replace(
"pytestCommand", "partialPytestCommandDisaggServer"
)
disagg_server_parts = [
part
for part in split_pytest_command_line(disagg_server_line)
if not is_llmapi_launch(part) and not is_output_file_part(part)
]
disagg_server_pytest_command = " ".join(disagg_server_parts)
benchmark_line = pytest_command_line.replace("pytestCommand", "partialPytestCommandBenchmark")
benchmark_parts = [
part for part in split_pytest_command_line(benchmark_line) if not is_llmapi_launch(part)
]
benchmark_pytest_command = " ".join(benchmark_parts)
return (
worker_pytest_command,
disagg_server_pytest_command,
benchmark_pytest_command,
)
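# For illustration (hypothetical flags, not the real pipeline command): if pytestCommand invokes
#   trtllm-llmapi-launch pytest --csv=report.csv -v tests/perf
# then the derived commands behave as follows:
#   - partialPytestCommandWorker keeps trtllm-llmapi-launch and -v but drops --csv=report.csv;
#   - partialPytestCommandDisaggServer drops both trtllm-llmapi-launch and --csv=report.csv;
#   - partialPytestCommandBenchmark drops trtllm-llmapi-launch but keeps --csv=report.csv.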
def get_config_yaml(test_list_path, llm_src):
@ -153,7 +250,7 @@ def get_config_yaml(test_list_path, llm_src):
"disagg",
"test_configs",
"disagg",
"perf",
"perf-sanity",
f"{config_base_name}.yaml",
)
if not os.path.exists(config_yaml_path):
@ -225,8 +322,12 @@ def main():
srun_args_lines = srun_args_content.split()
# Extract pytestCommand and generate pytestCommandNoLLMAPILaunch
pytest_command_no_llmapi_launch = get_pytest_command_no_llmapilaunch(script_prefix_lines)
# Extract pytestCommand and generate partial pytest commands
(
worker_pytest_command,
disagg_server_pytest_command,
benchmark_pytest_command,
) = get_pytest_commands(script_prefix_lines)
# Build worker env vars, add extra env vars for gen_only mode
worker_env_vars = env_config["worker_env_var"]
@ -244,12 +345,15 @@ def main():
script_prefix_lines.extend(
[
pytest_command_no_llmapi_launch,
f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $pytestCommand"',
f'export pytestCommandDisaggServer="{server_env_vars} $pytestCommandNoLLMAPILaunch"',
f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $pytestCommandNoLLMAPILaunch"',
worker_pytest_command,
disagg_server_pytest_command,
benchmark_pytest_command,
f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $partialPytestCommandWorker"',
f'export pytestCommandDisaggServer="{server_env_vars} $partialPytestCommandDisaggServer"',
f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $partialPytestCommandBenchmark"',
f"export runScript={args.run_sh}",
f"export installScript={install_script}",
f"export configYamlPath={config_yaml}",
f"export numCtxServers={hardware_config['num_ctx_servers']}",
f"export numGenServers={hardware_config['num_gen_servers']}",
f"export gpusPerNode={hardware_config['gpus_per_node']}",
@ -257,6 +361,8 @@ def main():
f"export gpusPerGenServer={hardware_config['gpus_per_gen_server']}",
f"export nodesPerCtxServer={hardware_config['nodes_per_ctx_server']}",
f"export nodesPerGenServer={hardware_config['nodes_per_gen_server']}",
f"export gpusPerfNodePerfCtxServer={hardware_config['gpus_per_node_per_ctx_server']}",
f"export gpusPerfNodePerfGenServer={hardware_config['gpus_per_node_per_gen_server']}",
f"export totalNodes={hardware_config['total_nodes']}",
f"export totalGpus={hardware_config['total_gpus']}",
]

View File

@ -13,8 +13,9 @@ sys.path.insert(0, sys.path[0] + "/..")
from open_search_db import OpenSearchDB
QUERY_LOOKBACK_DAYS = 90
LOOKBACK_JOBS = 30
MAX_QUERY_SIZE = 3000
MAX_TEST_CASES_PER_MSG = 5
MAX_TEST_CASES_PER_MSG = 4
POST_SLACK_MSG_RETRY_TIMES = 5
@ -99,42 +100,74 @@ def post_perf_data(data_list, project_name):
return False
def get_regression_data_by_job_id(data_list, query_job_number):
"""Returns a dict with job_id as key and list of regression data as value.
def get_regression_dict(data_list, query_job_number, lookback_job_number=LOOKBACK_JOBS):
"""Returns a dict with job_id as key and list of regression tuples as value.
Each tuple is (test_case_name, gpu_type, runtime, history_regression_job_ids, data).
Only returns the latest query_job_number jobs.
"""
if data_list is None or len(data_list) == 0:
return {}
# Group data by job_id
job_data_dict = {}
job_test_dict = {}
for data in data_list:
job_id = data.get("s_job_id", "")
if job_id == "":
raw_job_id = data.get("s_job_id", "")
if raw_job_id == "":
continue
if job_id not in job_data_dict:
job_data_dict[job_id] = []
job_data_dict[job_id].append(data)
try:
job_id = int(raw_job_id)
except (TypeError, ValueError):
continue
job_test_dict.setdefault(job_id, []).append(data)
# Sort job_ids by the latest ts_created in each group (descending)
def get_latest_timestamp(job_id):
timestamps = [d.get("ts_created", 0) for d in job_data_dict[job_id]]
return max(timestamps) if timestamps else 0
if not job_test_dict:
return {}
sorted_job_ids = sorted(job_data_dict.keys(), key=get_latest_timestamp, reverse=True)
# Sort job_ids (descending: latest -> oldest)
sorted_job_id_list = sorted(job_test_dict.keys(), reverse=True)
# Only keep the latest query_job_number jobs
latest_job_ids = sorted_job_ids[:query_job_number]
# Build (test_case_name, gpu_type, runtime) -> job_ids dict
test_job_dict = {}
for job_id, data_list in job_test_dict.items():
for data in data_list:
test_case_name = data.get("s_test_case_name") or ""
gpu_type = data.get("s_gpu_type") or ""
runtime = data.get("s_runtime") or ""
if not test_case_name or not gpu_type or not runtime:
continue
key = (test_case_name, gpu_type, runtime)
test_job_dict.setdefault(key, set()).add(job_id)
result = {}
# Sort job ids for each test case (descending: latest -> oldest)
for key, job_id_set in list(test_job_dict.items()):
test_job_dict[key] = sorted(job_id_set, reverse=True)
# Only keep the latest query_job_number jobs in the result
latest_job_ids = sorted_job_id_list[:query_job_number]
regression_dict = {}
for job_id in latest_job_ids:
result[job_id] = job_data_dict[job_id]
entries = []
for data in job_test_dict.get(job_id, []):
test_case_name = data.get("s_test_case_name") or ""
gpu_type = data.get("s_gpu_type") or ""
runtime = data.get("s_runtime") or ""
if not test_case_name or not gpu_type or not runtime:
continue
key = (test_case_name, gpu_type, runtime)
history_ids = test_job_dict.get(key, [])
lower_bound = job_id - lookback_job_number + 1
history_regression_job_ids = [
jid for jid in history_ids if lower_bound <= jid <= job_id
]
entries.append((test_case_name, gpu_type, runtime, history_regression_job_ids, data))
regression_dict[job_id] = entries
return result
return regression_dict
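# For illustration (hypothetical records, not real OpenSearch data): given
#   data_list = [
#       {"s_job_id": "102", "s_test_case_name": "t1", "s_gpu_type": "GB200", "s_runtime": "pytorch"},
#       {"s_job_id": "101", "s_test_case_name": "t1", "s_gpu_type": "GB200", "s_runtime": "pytorch"},
#   ]
# get_regression_dict(data_list, query_job_number=1) keeps only the latest job and returns
#   {102: [("t1", "GB200", "pytorch", [102, 101], <record for job 102>)]}
# because both regressions of ("t1", "GB200", "pytorch") fall inside the default 30-job lookback window.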
def process_regression_message(regression_dict):
def split_regression_message(regression_dict):
"""Process regression data into message chunks.
Returns a list of messages, each containing at most MAX_TEST_CASES_PER_MSG test cases.
@ -142,12 +175,17 @@ def process_regression_message(regression_dict):
if not regression_dict:
return []
# Flatten all test cases into a list with (job_id, idx, data) tuples
# Flatten all test cases into a list with
# (job_id, idx, test_case_name, gpu_type, runtime, history_regression_job_ids, data) tuples
all_test_cases = []
for job_id, data_list in regression_dict.items():
sorted_data_list = sorted(data_list, key=lambda x: x.get("s_test_case_name", ""))
for idx, data in enumerate(sorted_data_list, start=1):
all_test_cases.append((job_id, idx, data))
sorted_data_list = sorted(data_list, key=lambda x: x[0])
for idx, (test_case_name, gpu_type, runtime, history_regression_job_ids, data) in enumerate(
sorted_data_list, start=1
):
all_test_cases.append(
(job_id, idx, test_case_name, gpu_type, runtime, history_regression_job_ids, data)
)
# Split into chunks of MAX_TEST_CASES_PER_MSG
chunks = []
@ -159,7 +197,15 @@ def process_regression_message(regression_dict):
for chunk in chunks:
msg_parts = []
current_job_id = None
for job_id, idx, data in chunk:
for (
job_id,
idx,
test_case_name,
gpu_type,
runtime,
history_regression_job_ids,
data,
) in chunk:
# Add job header when switching to a new job_id
if job_id != current_job_id:
if msg_parts:
@ -168,12 +214,46 @@ def process_regression_message(regression_dict):
msg_parts.append(job_header)
current_job_id = job_id
test_case_name = data.get("s_test_case_name", "N/A")
regression_info = data.get("s_regression_info", "N/A")
history_text = (
", ".join(str(jid) for jid in history_regression_job_ids)
if history_regression_job_ids
else "N/A"
)
msg_parts.append(f"*REGRESSION TEST CASE {idx}: {test_case_name}*\n")
msg_parts.append(f"*GPU: {gpu_type} Mode: {runtime}*\n")
msg_parts.append(f"*History Regression Post-Merge Job IDs: {history_text}*\n")
# Parse regression_info to extract baseline info and metrics
baseline_date = "N/A"
baseline_branch = "N/A"
baseline_commit = "N/A"
for part in regression_info.split(","):
part = part.strip()
if part and "baseline_id" not in part:
if "baseline_date:" in part:
baseline_date = part.split(":", 1)[-1].strip()
elif "baseline_branch:" in part:
baseline_branch = part.split(":", 1)[-1].strip()
elif "baseline_commit:" in part:
baseline_commit = part.split(":", 1)[-1].strip()
# Get regression branch and commit from data
regression_date = data.get("ts_created", "N/A")
regression_branch = data.get("s_branch", "N/A")
regression_commit = data.get("s_commit", "N/A")
msg_parts.append(
f"*Baseline date, branch and commit: "
f"{baseline_date} {baseline_branch} {baseline_commit}*\n"
)
msg_parts.append(
f"*Regression date, branch and commit: "
f"{regression_date} {regression_branch} {regression_commit}*\n"
)
for part in regression_info.split(","):
part = part.strip()
if part and "baseline_" not in part:
msg_parts.append(f" {part}\n")
msg = "".join(msg_parts).strip()
@ -288,8 +368,8 @@ def main():
print("Failed to query regression data")
return
regression_dict = get_regression_data_by_job_id(data_list, args.query_job_number)
messages = process_regression_message(regression_dict)
regression_dict = get_regression_dict(data_list, args.query_job_number)
messages = split_regression_message(regression_dict)
send_regression_message(messages, args.channel_id, args.bot_token)
elif args.operation.strip().upper().startswith("UPDATE"):
set_values, where_values, error = parse_update_operation(args.operation)

View File

@ -58,7 +58,10 @@ cd $llmSrcNode/tests/integration/defs
trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
# In disaggregated mode, the coverage config file is only set for the benchmark pytest command.
if [[ -z "${DISAGG_SERVING_TYPE:-}" || "${DISAGG_SERVING_TYPE}" == "BENCHMARK" ]]; then
pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
fi
# Only the first process will save the coverage config file
if [ $SLURM_PROCID -eq 0 ]; then

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '2048'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 256
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 256
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '256'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 256
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 256
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1536'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 192
max_num_tokens: 384
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 192
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '256'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- B200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:8
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 8
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 256
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '128'
input_length: 131072
output_length: 8192
dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 8
max_num_tokens: 32
tensor_parallel_size: 16
moe_expert_parallel_size: 16
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 2
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 8
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.3
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '64'
input_length: 131072
output_length: 8192
dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 8
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 2
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 8
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.3
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,96 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 128k8k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 131072
output_length: 8192
dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 4
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 8
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.3
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 131104
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '3072'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 768
max_num_tokens: 1536
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 768
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -25,7 +25,7 @@ benchmark:
concurrency_list: '1024'
input_length: 1024
output_length: 1024
dataset_file: <dataset_file>
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
@ -56,18 +56,7 @@ worker_config:
max_seq_len: 2068
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
- 768
max_batch_size: 768
print_iter_log: true
kv_cache_config:
enable_block_reuse: false

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 128
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 128
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4096'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 256
max_num_tokens: 512
tensor_parallel_size: 16
moe_expert_parallel_size: 16
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 256
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,97 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 128
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 128
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,108 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
load_balancer:
num_slots: 256
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '2048'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 512
max_num_tokens: 1024
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 32k4k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '256'
input_length: 32768
output_length: 4096
dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 8
max_num_tokens: 256
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 32784
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,108 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 32k4k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '2048'
input_length: 32768
output_length: 4096
dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 64
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 1
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 32784
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 32k4k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 32768
output_length: 4096
dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 256
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 32784
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,104 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4096'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 128
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 128
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
load_balancer:
num_slots: 256
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true

View File

@ -0,0 +1,108 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
tokens_per_block: 64
moe_config:
backend: CUTEDSL
load_balancer:
num_slots: 256
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_v32_fp4
precision: fp4
model_dir_name: DeepSeek-V3.2-FP4-v2
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 1
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: MTP
num_nextn_predict_layers: 3
num_postprocess_workers: 4
stream_interval: 20
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 16384
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
tokens_per_block: 64
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001

View File

@ -0,0 +1,98 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '2048'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 64
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 384
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true

View File

@ -0,0 +1,95 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4096'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 512
max_num_tokens: 512
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTLASS
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true

View File

@ -0,0 +1,94 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4'
input_length: 1024
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 4
max_num_tokens: 128
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 4
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 16
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true

View File

@ -0,0 +1,98 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 5
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4096'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 256
max_num_tokens: 256
tensor_parallel_size: 16
moe_expert_parallel_size: 16
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 256
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 384
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
trust_remote_code: true

View File

@ -0,0 +1,104 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 32
max_num_tokens: 128
tensor_parallel_size: 32
moe_expert_parallel_size: 32
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 416
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: Eagle
max_draft_len: 3
eagle3_one_model: true
speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001
trust_remote_code: true

View File

@ -0,0 +1,101 @@
metadata:
model_name: k2_thinking_fp4
precision: fp4
model_dir_name: Kimi-K2-Thinking-NVFP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '4'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 4
max_num_tokens: 128
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 4
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: &id001
decoding_type: Eagle
max_draft_len: 3
eagle3_one_model: true
speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3
trust_remote_code: true
num_postprocess_workers: 4
stream_interval: 20
allreduce_strategy: MNNVL
ctx:
print_iter_log: true
max_batch_size: 2
max_num_tokens: 8192
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
disable_overlap_scheduler: true
speculative_config: *id001
trust_remote_code: true

View File

@ -0,0 +1,92 @@
metadata:
model_name: qwen3_235b_a22b_fp4
precision: fp4
model_dir_name: Qwen3-235B-A22B-FP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 128
max_num_tokens: 128
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
cuda_graph_config:
enable_padding: true
max_batch_size: 128
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: CUTEDSL
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 4
max_num_tokens: 32768
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true

View File

@ -0,0 +1,91 @@
metadata:
model_name: qwen3_235b_a22b_fp4
precision: fp4
model_dir_name: Qwen3-235B-A22B-FP4
supported_gpus:
- GB200
script_file: disaggr_torch.slurm
benchmark_type: 8k1k
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: --gres=gpu:4
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: false
multi_round: 10
benchmark_ratio: 0.0
streaming: true
concurrency_list: '64'
input_length: 8192
output_length: 1024
dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes
server_env_var: TRTLLM_SERVER_DISABLE_GC=1
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
print_iter_log: true
max_batch_size: 64
max_num_tokens: 64
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true
num_postprocess_workers: 4
stream_interval: 20
ctx:
print_iter_log: true
max_batch_size: 4
max_num_tokens: 32768
tensor_parallel_size: 1
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
context_parallel_size: 1
enable_attention_dp: false
cuda_graph_config: null
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 32768
backend: UCX
disable_overlap_scheduler: true

View File

@ -448,9 +448,12 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
continue
is_post_merge = new_data.get("b_is_post_merge", False)
baseline_id = history_baseline.get("_id", "")
info_parts = [f"baseline_id: {baseline_id}"]
info_parts = [
f"baseline_id: {history_baseline.get('_id', '')}",
f"baseline_branch: {history_baseline.get('s_branch', '')}",
f"baseline_commit: {history_baseline.get('s_commit', '')}",
f"baseline_date: {history_baseline.get('ts_created', '')}",
]
regressive_metrics = []
# Check all metrics and build info string
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:

View File

@ -56,6 +56,7 @@ MODEL_PATH_DICT = {
"deepseek_v32_fp4": "DeepSeek-V3.2-Exp-FP4-v2",
"gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
"k2_thinking_fp4": "Kimi-K2-Thinking-NVFP4",
"qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", # Qwen3-235B-A22B-FP4
}
SUPPORTED_GPU_MAPPING = {
@ -68,6 +69,9 @@ SUPPORTED_GPU_MAPPING = {
DEFAULT_TIMEOUT = 7200
AGGR_CONFIG_FOLDER = "tests/scripts/perf-sanity"
DISAGG_CONFIG_FOLDER = "tests/integration/defs/perf/disagg/test_configs/disagg/perf-sanity"
# Regex patterns for parsing benchmark output metrics
# Key is the metric name used in database (e.g., "mean_e2el", "seq_throughput")
PERF_METRIC_LOG_QUERIES = {
@ -97,9 +101,20 @@ def get_model_dir(model_name: str) -> str:
return ""
def get_dataset_path() -> str:
"""Get dataset path for benchmark."""
return os.path.join(llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json")
def get_dataset_dir(dataset_file: Optional[str]) -> str:
"""Get dataset directory path from dataset file."""
if not dataset_file or dataset_file == "<dataset_file>":
return ""
# return os.path.join(llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json")
llm_models_path = os.path.join(llm_models_root(), dataset_file)
if os.path.exists(llm_models_path):
return llm_models_path
elif os.path.exists(dataset_file):
return dataset_file
else:
print_info(f"Dataset file not found in {llm_models_path} and {dataset_file}")
return ""
def to_env_dict(env_vars: str) -> Dict[str, str]:
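For context, the dataset_file values used throughout the configs above (datasets/perf-ci/*.json) are resolved by get_dataset_dir() in this order: under llm_models_root(), then as a literal path, otherwise empty so the client falls back to the random dataset. A standalone sketch of that order (hypothetical helper; llm_models_root() belongs to the test utilities):

import os

def resolve_dataset_path(models_root: str, dataset_file: str) -> str:
    # Sketch of get_dataset_dir(): prefer the copy under the models root,
    # then a literal path, otherwise return "" so the random dataset is used.
    if not dataset_file or dataset_file == "<dataset_file>":
        return ""
    candidate = os.path.join(models_root, dataset_file)
    if os.path.exists(candidate):
        return candidate
    if os.path.exists(dataset_file):
        return dataset_file
    return ""

# The "<dataset_file>" placeholder resolves to "" (caller falls back to the random dataset);
# a real value such as "datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json"
# resolves to "<models_root>/datasets/perf-ci/..." when that file exists.
print(resolve_dataset_path("/models", "<dataset_file>"))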
@ -141,6 +156,7 @@ class ServerConfig:
self.disable_overlap_scheduler = server_config_data.get("disable_overlap_scheduler", False)
self.num_postprocess_workers = server_config_data.get("num_postprocess_workers", 0)
self.stream_interval = server_config_data.get("stream_interval", 10)
self.print_iter_log = server_config_data.get("print_iter_log", False)
self.attn_backend = server_config_data.get("attn_backend", "TRTLLM")
self.enable_chunked_prefill = server_config_data.get("enable_chunked_prefill", False)
self.enable_attention_dp = server_config_data.get("enable_attention_dp", False)
@ -213,6 +229,7 @@ class ServerConfig:
self.eagle3_layers_to_capture = []
self.max_draft_len = speculative_config.get("max_draft_len", 0)
self.speculative_model = speculative_config.get("speculative_model", "")
self.eagle3_one_model = speculative_config.get("eagle3_one_model", False)
# match_mode: "config" (default) or "scenario"
self.match_mode = server_config_data.get("match_mode", "config")
@ -340,6 +357,7 @@ class ServerConfig:
"s_eagle3_layers_to_capture": ",".join(map(str, self.eagle3_layers_to_capture)),
"l_max_draft_len": self.max_draft_len,
"s_speculative_model_dir": self.speculative_model,
"b_eagle3_one_model": self.eagle3_one_model,
"s_server_log_link": "",
"s_server_env_var": self.env_vars,
}
@ -366,7 +384,12 @@ class ServerConfig:
class ClientConfig:
"""Configurations of benchmark client."""
def __init__(self, client_config_data: dict, model_name: str, env_vars: str = ""):
def __init__(
self,
client_config_data: dict,
model_name: str,
env_vars: str = "",
):
self.model_name = model_name
self.concurrency = client_config_data.get("concurrency", 1)
self.iterations = client_config_data.get("iterations", 1)
@ -378,6 +401,7 @@ class ClientConfig:
self.streaming = client_config_data.get("streaming", True)
self.trust_remote_code = client_config_data.get("trust_remote_code", True)
self.model_path = ""
self.dataset_file = client_config_data.get("dataset_file", "")
self.env_vars = env_vars
# Generate default name if not provided
@ -389,7 +413,7 @@ class ClientConfig:
"""Generate benchmark command."""
model_dir = get_model_dir(self.model_name)
self.model_path = model_dir if os.path.exists(model_dir) else self.model_name
dataset_path = get_dataset_path()
dataset_path = get_dataset_dir(self.dataset_file)
benchmark_cmd = [
"python",
"-m",
@ -398,9 +422,6 @@ class ClientConfig:
self.model_path,
"--tokenizer",
self.model_path,
"--dataset-name",
"random",
"--random-ids",
"--num-prompts",
str(self.concurrency * self.iterations),
"--max-concurrency",
@ -409,15 +430,27 @@ class ClientConfig:
str(self.isl),
"--random-output-len",
str(self.osl),
"--random-range-ratio",
str(self.random_range_ratio),
"--ignore-eos",
"--no-test-input",
"--percentile-metrics",
"ttft,tpot,itl,e2el",
]
if dataset_path and os.path.exists(dataset_path):
if dataset_path:
benchmark_cmd.append("--dataset-name")
benchmark_cmd.append("trtllm_custom")
benchmark_cmd.append("--dataset-path")
benchmark_cmd.append(dataset_path)
print_info(f"Dataset: {dataset_path} exists. Use trtllm_custom dataset for benchmark.")
else:
benchmark_cmd.append("--dataset-name")
benchmark_cmd.append("random")
benchmark_cmd.append("--random-ids")
benchmark_cmd.append("--random-range-ratio")
benchmark_cmd.append(str(self.random_range_ratio))
print_info(
f"Dataset file '{self.dataset_file}' is not provided or does not exist. "
f"Using random dataset (random_range_ratio={self.random_range_ratio}) for benchmark."
)
if self.backend:
benchmark_cmd.append("--backend")
benchmark_cmd.append(self.backend)
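The net effect on the client command line is one of two flag sets, sketched below (illustrative; it mirrors the branch above rather than reproducing the full benchmark invocation):

from typing import List

def dataset_flags(dataset_path: str, random_range_ratio: float) -> List[str]:
    # A resolved dataset file switches the client to the trtllm_custom dataset;
    # otherwise the random dataset with --random-ids is used.
    if dataset_path:
        return ["--dataset-name", "trtllm_custom", "--dataset-path", dataset_path]
    return ["--dataset-name", "random", "--random-ids",
            "--random-range-ratio", str(random_range_ratio)]

print(dataset_flags("datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json", 0.0))
print(dataset_flags("", 0.0))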
@ -453,6 +486,7 @@ class ClientConfig:
"l_isl": self.isl,
"l_osl": self.osl,
"d_random_range_ratio": self.random_range_ratio,
"s_dataset_file": self.dataset_file,
"s_backend": self.backend,
"b_use_chat_template": self.use_chat_template,
"b_streaming": self.streaming,
@ -840,7 +874,7 @@ class PerfSanityTestConfig:
if is_disagg:
# For disagg: disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX
self.runtime = "multi_node_disagg_server"
self.config_dir = "tests/integration/defs/perf/disagg/test_configs/disagg/perf"
self.config_dir = DISAGG_CONFIG_FOLDER
config_base = "-".join(labels[1:])
self.config_file = (
f"{config_base}.yaml" if not config_base.endswith(".yaml") else config_base
@ -849,7 +883,7 @@ class PerfSanityTestConfig:
else:
# For aggr: aggr_upload-config_yml or aggr_upload-config_yml-server_config_name
self.runtime = "aggr_server"
self.config_dir = "tests/scripts/perf-sanity"
self.config_dir = AGGR_CONFIG_FOLDER
config_base = labels[1]
self.config_file = (
f"{config_base}.yaml"
@ -922,7 +956,9 @@ class PerfSanityTestConfig:
client_configs = []
for client_config_data in server_config_data["client_configs"]:
client_config = ClientConfig(
client_config_data, server_config_data["model_name"], client_env_var
client_config_data,
server_config_data["model_name"],
env_vars=client_env_var,
)
client_configs.append(client_config)
@ -1026,8 +1062,13 @@ class PerfSanityTestConfig:
"backend": "openai",
"use_chat_template": False,
"streaming": benchmark.get("streaming", True),
"dataset_file": benchmark.get("dataset_file", ""),
}
client_config = ClientConfig(client_config_data, model_name, client_env_var)
client_config = ClientConfig(
client_config_data,
model_name,
env_vars=client_env_var,
)
client_configs.append(client_config)
self.server_client_configs = {0: client_configs}
@ -1417,9 +1458,6 @@ class PerfSanityTestConfig:
AGG_TEST_TYPES = ["aggr_upload", "aggr"]
DISAGG_TEST_TYPES = ["disagg_upload", "disagg"]
AGGR_CONFIG_FOLDER = "tests/scripts/perf-sanity"
DISAGG_CONFIG_FOLDER = "tests/integration/defs/perf/disagg/test_configs/disagg/perf"
def get_server_config_names(yaml_path: str) -> List[str]:
"""Read a YAML file and return the list of server_config names."""

View File

@ -0,0 +1,21 @@
version: 0.0.1
l0_dgx_b200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8:
- condition:
ranges:
# 2 nodes, each with 8 GPUs
system_gpu_count:
gte: 16
lte: 16
wildcards:
gpu:
- '*b200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-b200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (90)

View File

@ -1,5 +1,5 @@
version: 0.0.1
l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes:
l0_gb200_multi_nodes_aggr_perf_sanity_node2_gpu8:
- condition:
ranges:
# 2 nodes, each with 4 GPUs

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes:
- condition:
ranges:
# 3 nodes, each with 4 GPUs
system_gpu_count:
gte: 12
lte: 12
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (90)

View File

@ -1,17 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes:
- condition:
ranges:
# 6 nodes, each with 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (90)

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes:
- condition:
ranges:
# 8 nodes, each with 4 GPUs
system_gpu_count:
gte: 32
lte: 32
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4:
- condition:
ranges:
# 1 ctx worker: 1 node, 1 GPU
# 1 gen worker: 1 node, 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8:
- condition:
ranges:
# 1 ctx worker: 1 node, 1 GPU
# 1 gen worker: 2 nodes, 8 GPUs
system_gpu_count:
gte: 12
lte: 12
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-qwen3-235b-fp4_8k1k_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,19 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4:
- condition:
ranges:
# 1 ctx worker: 1 node, 4 GPUs
# 1 gen worker: 1 node, 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
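The system_gpu_count bounds in these condition files follow from the topology encoded in the stage name: each worker is allocated whole nodes, so the bound is (ctx nodes + gen nodes) multiplied by the GPUs per node (4 on GB200, 8 on DGX B200), regardless of how many GPUs a worker actually uses on its nodes. A small sketch of that arithmetic (hypothetical helper, not part of the commit):

def required_system_gpus(ctx_nodes: int, gen_nodes: int, gpus_per_node: int) -> int:
    # Whole-node allocation: the condition must match every GPU on every allocated node.
    return (ctx_nodes + gen_nodes) * gpus_per_node

assert required_system_gpus(1, 1, 4) == 8    # ctx1_node1_gpu1 + gen1_node1_gpu4 (GB200)
assert required_system_gpus(1, 2, 4) == 12   # ctx1_node1_gpu1 + gen1_node2_gpu8 (GB200)
assert required_system_gpus(1, 1, 8) == 16   # ctx1_node1_gpu4 + gen1_node1_gpu8 (DGX B200)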

View File

@ -0,0 +1,24 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8:
- condition:
ranges:
# 1 ctx worker: 1 node, 4 GPUs
# 1 gen worker: 2 nodes, 8 GPUs
system_gpu_count:
gte: 12
lte: 12
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
# - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
# - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
# - perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,18 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16:
- condition:
ranges:
# 1 ctx worker: 1 node, 4 GPUs
# 1 gen worker: 4 nodes, 16 GPUs
system_gpu_count:
gte: 20
lte: 20
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep16_eplb384_mtp0_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,25 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32:
- condition:
ranges:
      # 1 ctx worker using 1 node and 4 GPUs
      # 1 gen worker using 8 nodes and 32 GPUs
system_gpu_count:
gte: 36
lte: 36
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_32k4k_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-v32-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_1k1k_ctx1_dep4_gen1_dep32_eplb384_mtp0_ccb-UCX] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-kimi-k2-thinking-fp4_8k1k_ctx1_dep4_gen1_dep32_eplb416_mtp3_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8:
- condition:
ranges:
      # 1 ctx worker using 2 nodes and 8 GPUs
      # 1 gen worker using 2 nodes and 8 GPUs
system_gpu_count:
gte: 16
lte: 16
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16:
- condition:
ranges:
      # 1 ctx worker using 2 nodes and 8 GPUs
      # 1 gen worker using 4 nodes and 16 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32:
- condition:
ranges:
      # 1 ctx worker using 2 nodes and 8 GPUs
      # 1 gen worker using 8 nodes and 32 GPUs
system_gpu_count:
gte: 40
lte: 40
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-gb200-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120)

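Across the testlists above, the system_gpu_count bounds follow directly from the worker topology encoded in each file name: every node allocated to a ctx or gen worker appears to be counted in full at 4 GPUs per GB200 node, even when a worker uses fewer GPUs (ctx1_node1_gpu1 still accounts for a whole 4-GPU node). A minimal sketch of that arithmetic, with the 4-GPUs-per-node figure as an assumption:

GPUS_PER_NODE = 4  # assumed GB200 node size; every allocated node counts in full

def system_gpu_count(ctx_nodes: int, gen_nodes: int) -> int:
    # Total GPUs a stage must reserve, independent of how many GPUs
    # each worker actually uses on its nodes.
    return (ctx_nodes + gen_nodes) * GPUS_PER_NODE

# These match the gte/lte bounds in the testlists above.
assert system_gpu_count(ctx_nodes=1, gen_nodes=2) == 12   # ctx1_node1_gpu1 + gen1_node2_gpu8
assert system_gpu_count(ctx_nodes=1, gen_nodes=8) == 36   # ctx1_node1_gpu4 + gen1_node8_gpu32
assert system_gpu_count(ctx_nodes=2, gen_nodes=8) == 40   # ctx1_node2_gpu8 + gen1_node8_gpu32
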
View File

@ -291,9 +291,7 @@ unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks SKIP (htt
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5819019)
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_gptoss_style_nvfp4[limit1-beta0-alpha1-RoutingGPTOSS-512-512-1] SKIP (https://nvbugs/5819042)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5819048)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_dep8_32k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_2_nodes_grace_blackwell-k2_thinking_fp4_tep8_32k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[aggr_upload-k2_thinking_fp4_grace_blackwell-k2_thinking_fp4_tep4_8k1k] SKIP (https://nvbugs/5820541)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5819021)
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_cache_aware_balance[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5820576)
llmapi/test_llm_examples.py::test_llmapi_tensorrt_engine SKIP (https://nvbugs/5820553)
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False] SKIP (https://nvbugs/5820938)
@ -317,10 +315,8 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/5701445)
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5820734)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] SKIP (https://nvbugs/5819053)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5823284)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] SKIP (https://nvbugs/5819053)
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5819053)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5826604)
disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5834212)
accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_multi_gpus[throughput_trtllm] SKIP (https://nvbugs/5837275)
@ -337,7 +333,6 @@ test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5819444)
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5819452)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5800646)
accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] SKIP (https://nvbugs/5819053)
examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct] SKIP (https://nvbugs/5838178)
accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16 SKIP (https://nvbugs/5838184)
cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5838199)
@ -362,7 +357,6 @@ full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bflo
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5846154)
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] SKIP (https://nvbugs/5846166)
accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5847284)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)

View File

@ -38,8 +38,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
@ -74,8 +74,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 1k1k configs - TEP8 with TRTLLM, MTP3
- name: "r1_fp4_v2_tep8_mtp3"
@ -105,5 +105,5 @@ server_configs:
iterations: 12
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json

View File

@ -31,8 +31,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - DEP8 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep8_mtp1_1k1k"
@ -66,8 +66,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -97,8 +97,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep8_mtp1_8k1k"
@ -132,5 +132,5 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

View File

@ -31,20 +31,13 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
- name: "con1024_iter10_1k1k"
concurrency: 1024
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - TEP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
@ -74,8 +67,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - TP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
@ -100,20 +93,13 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con4_iter10_1k1k"
concurrency: 4
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
- name: "con2_iter10_1k1k"
concurrency: 2
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP4 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep4_mtp1_8k1k"
@ -142,20 +128,13 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter5_8k1k"
concurrency: 2048
iterations: 5
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "con256_iter10_8k1k"
concurrency: 256
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TEP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tep4_mtp3_8k1k"
@ -185,8 +164,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -211,20 +190,13 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con4_iter10_8k1k"
concurrency: 4
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "con2_iter10_8k1k"
concurrency: 2
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 1k8k configs - DEP4 with CUTLASS, MTP1
- name: "r1_fp4_v2_dep4_mtp1_1k8k"
@ -258,8 +230,8 @@ server_configs:
iterations: 5
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json
# 1k8k configs - TEP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tep4_mtp3_1k8k"
@ -289,8 +261,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json
# 1k8k configs - TP4 with TRTLLM, MTP3
- name: "r1_fp4_v2_tp4_mtp3_1k8k"
@ -320,5 +292,5 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k8k-20480-ratio-1_for_serve.json

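The recurring change across these perf-sanity server configs drops the synthetic random_range_ratio knob and points each client config at a fixed, pre-generated dataset_file, presumably so consecutive runs exercise the same request mix. A hypothetical post-change client_configs entry, written as a Python literal for compactness (field names and values mirror the entries above):

client_config = {
    "name": "con1024_iter10_1k1k",
    "concurrency": 1024,
    "iterations": 10,
    "isl": 1024,   # input sequence length
    "osl": 1024,   # output sequence length
    "backend": "openai",
    # replaces random_range_ratio: requests are replayed from a pre-generated dataset
    "dataset_file": "datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json",
}
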
View File

@ -31,8 +31,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - DEP8 with DEEPGEMM, MTP1
- name: "r1_fp8_dep8_mtp1_1k1k"
@ -66,8 +66,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TP8 with TRTLLM, MTP3
- name: "r1_fp8_tp8_mtp3_8k1k"
@ -97,8 +97,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with DEEPGEMM, MTP1
- name: "r1_fp8_dep8_mtp1_8k1k"
@ -132,5 +132,5 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json

View File

@ -31,8 +31,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with CUTLASS, MTP1
- name: "v32_fp4_dep8_mtp1_8k1k"
@ -66,5 +66,5 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

View File

@ -31,8 +31,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
# 1k1k configs - DEP4 with CUTLASS, MTP1
- name: "v32_fp4_dep4_mtp1_1k1k"
@ -66,8 +66,8 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json
# 8k1k configs - TEP4 with TRTLLM, MTP3
- name: "v32_fp4_tep4_mtp3_8k1k"
@ -97,8 +97,8 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP4 with CUTLASS, MTP1
- name: "v32_fp4_dep4_mtp1_8k1k"
@ -132,5 +132,5 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json

View File

@ -32,7 +32,7 @@ server_configs:
iterations: 5
isl: 1024
osl: 8192
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"
- name: "gpt_oss_fp4_dep2_1k1k"
@ -63,7 +63,7 @@ server_configs:
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"
- name: "gpt_oss_fp4_tep2_1k8k"
@ -92,7 +92,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"
- name: "gpt_oss_fp4_tp2_1k8k"
@ -121,7 +121,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"
- name: "gpt_oss_fp4_tp4_eagle3_1k1k"
@ -155,5 +155,5 @@ server_configs:
iterations: 32
isl: 1024
osl: 1024
random_range_ratio: 0.8
random_range_ratio: 0.0
backend: "openai"

View File

@ -32,9 +32,9 @@ server_configs:
iterations: 10
isl: 32768
osl: 8192
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json
# 32k8k configs - DEP8 with CUTLASS
- name: "k2_thinking_fp4_dep8_32k8k"
@ -67,6 +67,6 @@ server_configs:
iterations: 10
isl: 32768
osl: 8192
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

View File

@ -29,9 +29,9 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP8 with CUTLASS
- name: "k2_thinking_fp4_dep8_8k1k"
@ -63,9 +63,9 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
# 32k8k configs - TEP8 with TRTLLM
- name: "k2_thinking_fp4_tep8_32k8k"
@ -94,9 +94,9 @@ server_configs:
iterations: 10
isl: 32768
osl: 8192
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json
# 32k8k configs - DEP8 with CUTLASS
- name: "k2_thinking_fp4_dep8_32k8k"
@ -129,6 +129,6 @@ server_configs:
iterations: 10
isl: 32768
osl: 8192
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-32k8k-20480-ratio-1_for_serve.json

View File

@ -29,9 +29,9 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json
# 8k1k configs - DEP4 with CUTLASS
- name: "k2_thinking_fp4_dep4_8k1k"
@ -63,6 +63,6 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
backend: "openai"
trust_remote_code: true
dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json