[TRTLLM-8952][feat] Support Multi-Node Disagg Perf Test in CI (#9138)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
chenfeiz0326 2025-12-26 22:50:53 +08:00 committed by GitHub
parent 684b37df02
commit d70aeddc7f
43 changed files with 1680 additions and 899 deletions

View File

@@ -748,9 +748,9 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}
// End of Methods to run Slurm job with Jenkins Agent
-def getNodeArgs(int nodeCount, int gpuCount) {
+def getNodeArgs(int nodeCount, int gpuCount, boolean setSegment = false) {
    int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
-    return nodeCount == 1 ? [
+    def args = nodeCount == 1 ? [
        "--nodes=${nodeCount}",
        "--gpus=${gpuCount}"
    ] : [
@@ -759,6 +759,10 @@ def getNodeArgs(int nodeCount, int gpuCount) {
        "--ntasks-per-node=${gpusPerNode}",
        "--gpus-per-node=${gpusPerNode}",
    ]
+    if (setSegment && gpuCount > 1) {
+        args += ["--segment=${nodeCount}"]
+    }
+    return args
}
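For reference, a minimal Python sketch of the same node-argument arithmetic. The --nodes/--ntasks entries of the multi-node branch are an assumption (that part of the Groovy function is outside this hunk); only the ceiling division, the --ntasks-per-node/--gpus-per-node lines, and the --segment flag are taken from the diff.

def get_node_args(node_count, gpu_count, set_segment=False):
    # Ceiling division, mirroring the Groovy BigDecimal ROUND_CEILING logic.
    gpus_per_node = -(-gpu_count // node_count)
    if node_count == 1:
        args = [f"--nodes={node_count}", f"--gpus={gpu_count}"]
    else:
        args = [
            f"--nodes={node_count}",          # assumed, not shown in the hunk
            f"--ntasks={gpu_count}",          # assumed, not shown in the hunk
            f"--ntasks-per-node={gpus_per_node}",
            f"--gpus-per-node={gpus_per_node}",
        ]
    if set_segment and gpu_count > 1:
        # --segment keeps the allocation on one contiguous segment of nodes (assumed semantics).
        args.append(f"--segment={node_count}")
    return args

# e.g. the 3-node disagg stage: 12 GPUs across 3 nodes -> 4 GPUs per node
assert "--segment=3" in get_node_args(3, 12, set_segment=True)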
def getPytestBaseCommandLine( def getPytestBaseCommandLine(
@@ -883,6 +887,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
    // Create a unique suffix for the job name
    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
    def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
+    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def setSegment = disaggMode
    Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@@ -914,6 +920,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
    def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
    def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
    def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
+    def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
+    def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
    def testListPathNode = "${jobWorkspace}/${testList}.txt"
    def waivesListPathNode = "${jobWorkspace}/waives.txt"
    def outputPath = "${jobWorkspace}/job-output.log"
@@ -940,6 +948,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
        true
    )
+    Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        scriptInstallLocalPath,
+        scriptInstallPathNode,
+        true
+    )
    // Generate Test List and Upload to Frontend Node
    def makoArgs = getMakoArgsFromStageName(stageName, true)
    // TODO: currently the options will only be processed if the first
@@ -1013,7 +1030,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
    // Generate Job Launch Script
    def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
    def mounts = getMountListForSlurmTest(cluster, true).join(",")
-    String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
+    String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment)
    if (taskArgs == null) {
        error "Invalid Slurm test stage name is set"
    }
@@ -1083,10 +1100,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
    envVarsToExport.each { varName, varValue ->
        srunArgs.add("--container-env=${varName}")
    }
-    if(nodeCount > 1) {
-        srunArgs.add("--mpi=pmi2")
-    }
    def exemptionComment = ""
    if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) {
        exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'"""
@@ -1102,8 +1115,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
        "export ${varName}=\"${escapedValue}\""
    }.join('\n')
-    def scriptContent = """#!/bin/bash
-    #SBATCH ${exemptionComment} --output=${outputPath}
+    def scriptLaunchPrefix = """#!/bin/bash
+    #SBATCH ${exemptionComment}
+    #SBATCH --output=${outputPath}
    ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
    #SBATCH ${partition.additionalArgs}
    ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
@@ -1128,10 +1142,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
    echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES"
    ${srunPrologue}
-    srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
    """.replaceAll("(?m)^\\s*", "")
-    pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
if (disaggMode) {
if(nodeCount > 1) {
srunArgs.add("--mpi=pmix")
}
def scriptLaunchPrefixPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch_prefix.sh")
def scriptLaunchSrunArgsPathLocal = Utils.createTempLocation(pipeline, "./slurm_srun_args.txt")
def scriptLaunchDraftPathLocal = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh"
def scriptSubmitLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/submit.py"
pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")
// submit.py writes the generated launch script to scriptLaunchPathLocal when running in disagg mode
sh """
python3 ${scriptSubmitLocalPath} \\
--run-ci \\
--llm-src ${llmSrcLocal} \\
--test-list ${testListPathLocal} \\
--draft-launch-sh ${scriptLaunchDraftPathLocal} \\
--launch-sh ${scriptLaunchPathLocal} \\
--run-sh ${scriptRunPathNode} \\
--install-sh ${scriptInstallPathNode} \\
--script-prefix ${scriptLaunchPrefixPathLocal} \\
--srun-args ${scriptLaunchSrunArgsPathLocal}
"""
} else {
if(nodeCount > 1) {
srunArgs.add("--mpi=pmi2")
}
def scriptContent = """
${scriptLaunchPrefix}
srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
""".replaceAll("(?m)^\\s*", "")
pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
}
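In disagg mode the final sbatch script is assembled by submit.py instead of being written inline. A hypothetical Python sketch of that assembly (SBATCH prefix, an srunArgs bash array, then the draft orchestration body); it is an illustration of the flow, not the actual submit.py code.

def assemble_launch_script(prefix_text, srun_args, draft_body):
    # Prefix (#SBATCH lines + exported env), then the srunArgs array, then the draft body.
    srun_block = "\n".join(["srunArgs=("] + [f'  "{a}"' for a in srun_args] + [")"])
    return f"{prefix_text}\n{srun_block}\n{draft_body}"

launch_sh = assemble_launch_script(
    "#!/bin/bash\n#SBATCH --nodes=3",
    ["--container-image=urm.nvidia.com#example", "--mpi=pmix"],   # illustrative values
    'srun "${srunArgs[@]}" $installScript',
)
print(launch_sh)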
    Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
    Utils.copyFileToRemoteHost(
        pipeline,
@@ -2634,7 +2686,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
    if (noRegularTests && noIsolateTests) {
        error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result."
    }
}
}
@@ -2653,7 +2704,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
    stage("Check perf result") {
        def perfCheckResult = sh(
            script: """
                python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
                ${stageName}/perf_script_test_results.csv \
                ${basePerfPath}
            """,
@@ -2672,6 +2723,22 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
            """
        }
    }
if (perfMode && stageName.contains("Perf-Sanity")) {
stage ("Check perf result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
${WORKSPACE}/${stageName}
""",
returnStatus: true
)
// TODO: Enable this when perf regression check is stable
// if (perfCheckResult != 0) {
// error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
// }
}
}
    }
}
@@ -3111,8 +3178,13 @@ def launchTestJobs(pipeline, testFilter)
    "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
    "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
    "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-    // Perf sanity post merge test
-    "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2],
+    // Perf sanity post merge aggr tests
+    "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
+    // Perf sanity post merge disagg tests
+    "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
+    // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
+    // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
+    // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
]
fullSet += multiNodesSBSAConfigs.keySet()
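Each entry maps a stage name to a list whose field order appears to be [platform, testList, splitId, splitCount, gpuCount, nodeCount]; a hypothetical helper that unpacks one entry under that assumption:

def parse_stage_config(entry):
    # Assumed field order; not defined anywhere in this diff.
    platform, test_list, split_id, split_count, gpu_count, node_count = entry
    return {
        "platform": platform,
        "testList": test_list,
        "split": f"{split_id}/{split_count}",
        "gpuCount": gpu_count,
        "nodeCount": node_count,
        "gpusPerNode": -(-gpu_count // node_count),   # same ceiling division as getNodeArgs
    }

cfg = parse_stage_config(
    ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3])
assert cfg["gpusPerNode"] == 4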

View File

@ -0,0 +1,76 @@
cleanup_on_failure() {
echo "Error: $1"
scancel ${SLURM_JOB_ID}
exit 1
}
mkdir -p $jobWorkspace
chmod +x $runScript
chmod +x $installScript
# Run installation on all nodes
echo "Running installation on all nodes..."
if ! srun "${srunArgs[@]}" $installScript &> $jobWorkspace/install.log; then
cleanup_on_failure "Failed to run installation. Check $jobWorkspace/install.log"
fi
echo "Installation completed on all nodes"
# Start gen servers
echo "Starting gen servers..."
for i in $(seq 0 $((numGenServers - 1))); do
gen_world_size=$((nodesPerGenServer * gpusPerNode))
export DISAGG_SERVING_TYPE="GEN_$i"
export pytestCommand="$pytestCommandWorker"
srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
-N $nodesPerGenServer \
--ntasks=$gen_world_size \
--ntasks-per-node=$gpusPerNode \
$runScript &> $jobWorkspace/gen_server_$i.log &
echo "Started gen server $i"
done
# Start ctx servers (skip if gen_only mode)
if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then
echo "Starting ctx servers..."
for i in $(seq 0 $((numCtxServers - 1))); do
ctx_world_size=$((nodesPerCtxServer * gpusPerNode))
export DISAGG_SERVING_TYPE="CTX_$i"
export pytestCommand="$pytestCommandWorker"
srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
-N $nodesPerCtxServer \
--ntasks=$ctx_world_size \
--ntasks-per-node=$gpusPerNode \
$runScript &> $jobWorkspace/ctx_server_$i.log &
echo "Started ctx server $i"
done
else
echo "Skipping ctx servers (gen_only mode)"
fi
# Start disagg server
echo "Starting disagg server..."
export DISAGG_SERVING_TYPE="DISAGG_SERVER"
export pytestCommand="$pytestCommandDisaggServer"
srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \
-N 1 \
--ntasks=1 \
--ntasks-per-node=1 \
$runScript &> $jobWorkspace/disagg_server.log &
echo "Started disagg server"
# Start benchmark
echo "Starting benchmark..."
export DISAGG_SERVING_TYPE="BENCHMARK"
export pytestCommand="$pytestCommandBenchmark"
if ! srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \
-N 1 \
--ntasks=1 \
--ntasks-per-node=1 \
$runScript; then
cleanup_on_failure "Benchmark failed. Check logs in ${jobWorkspace} for details"
fi
echo "Disagg server and benchmark completed successfully"
echo "Total runtime: $SECONDS seconds"

View File

@ -0,0 +1,292 @@
#!/usr/bin/env python3
import argparse
import os
import yaml
def get_hardware_config(config, benchmark_mode):
hardware = config.get("hardware", {})
worker_config = config.get("worker_config", {})
num_ctx_servers = 0 if "gen_only" in benchmark_mode else hardware.get("num_ctx_servers")
num_gen_servers = hardware.get("num_gen_servers")
gpus_per_node = hardware.get("gpus_per_node")
# Get gpus_per_ctx_server and gpus_per_gen_server from worker_config's tensor_parallel_size
ctx_config = worker_config.get("ctx", {})
gen_config = worker_config.get("gen", {})
ctx_tp = ctx_config.get("tensor_parallel_size", 1)
ctx_pp = ctx_config.get("pipeline_parallel_size", 1)
ctx_cp = ctx_config.get("context_parallel_size", 1)
gpus_per_ctx_server = ctx_tp * ctx_pp * ctx_cp
gen_tp = gen_config.get("tensor_parallel_size", 1)
gen_pp = gen_config.get("pipeline_parallel_size", 1)
gen_cp = gen_config.get("context_parallel_size", 1)
gpus_per_gen_server = gen_tp * gen_pp * gen_cp
if None in [
num_ctx_servers,
num_gen_servers,
gpus_per_node,
gpus_per_ctx_server,
gpus_per_gen_server,
]:
raise ValueError("Missing required hardware configuration")
# Calculate nodes per server
nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node
nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node
total_nodes = num_ctx_servers * nodes_per_ctx_server + num_gen_servers * nodes_per_gen_server
total_gpus = total_nodes * gpus_per_node
return {
"num_ctx_servers": num_ctx_servers,
"num_gen_servers": num_gen_servers,
"gpus_per_node": gpus_per_node,
"gpus_per_ctx_server": gpus_per_ctx_server,
"gpus_per_gen_server": gpus_per_gen_server,
"nodes_per_ctx_server": nodes_per_ctx_server,
"nodes_per_gen_server": nodes_per_gen_server,
"total_nodes": total_nodes,
"total_gpus": total_gpus,
}
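As a worked example of this arithmetic, assuming the 1k1k e2e config added in this PR (gpus_per_node 4, ctx tp 4, gen tp 8, one ctx and one gen server) is the one used by the enabled 3-node stage:

# Assumed values taken from the e2e disagg YAML in this PR.
gpus_per_node = 4
gpus_per_ctx_server, gpus_per_gen_server = 4, 8
nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node   # 1
nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node   # 2
total_nodes = 1 * nodes_per_ctx_server + 1 * nodes_per_gen_server                   # 3
total_gpus = total_nodes * gpus_per_node                                            # 12
assert (total_nodes, total_gpus) == (3, 12)   # matches the GB200-12_GPUs-3_Nodes stage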
def get_env_config(config):
env = config.get("environment", {})
container = env.get("container_image", "")
mounts = env.get("container_mount", "")
workdir = env.get("container_workdir", "")
llm_models_root = env.get("llm_models_root", "")
llmsrc = env.get("trtllm_repo", "")
build_wheel = env.get("build_wheel", False)
# Use work_dir as job_workspace
job_workspace = env.get("work_dir", "")
worker_env_var = env.get("worker_env_var", "")
server_env_var = env.get("server_env_var", "")
benchmark_env_var = env.get("benchmark_env_var", "")
open_search_db_base_url = env.get("open_search_db_base_url", "")
return {
"container": container,
"mounts": mounts,
"workdir": workdir,
"llm_models_root": llm_models_root,
"llmsrc": llmsrc,
"build_wheel": build_wheel,
"job_workspace": job_workspace,
"worker_env_var": worker_env_var,
"server_env_var": server_env_var,
"benchmark_env_var": benchmark_env_var,
"open_search_db_base_url": open_search_db_base_url,
}
def get_benchmark_config(config):
benchmark = config.get("benchmark", {})
mode = benchmark.get("mode", "e2e")
concurrency_str = benchmark.get("concurrency_list", "1")
concurrency = int(concurrency_str) if isinstance(concurrency_str, str) else concurrency_str
return {
"mode": mode,
"concurrency": concurrency,
}
def remove_whitespace_lines(lines):
return [line.strip() for line in lines if line.strip()]
def get_pytest_command_no_llmapilaunch(script_prefix_lines):
pytest_command_line = None
for line in script_prefix_lines:
if "export pytestCommand=" in line:
pytest_command_line = line
break
if not pytest_command_line:
return ""
# Replace pytestCommand with pytestCommandNoLLMAPILaunch
replaced_line = pytest_command_line.replace("pytestCommand", "pytestCommandNoLLMAPILaunch")
# Split by space, find and remove the substring with trtllm-llmapi-launch
replaced_line_parts = replaced_line.split()
replaced_line_parts_no_llmapi = [
part for part in replaced_line_parts if "trtllm-llmapi-launch" not in part
]
return " ".join(replaced_line_parts_no_llmapi)
def get_config_yaml(test_list_path, llm_src):
with open(test_list_path, "r") as f:
first_line = f.readline().strip()
if "[" not in first_line or "]" not in first_line:
raise ValueError(
f"Invalid test list format. Expected test name with brackets: {first_line}"
)
bracket_content = first_line.split("[")[-1].split("]")[0]
parts = bracket_content.split("-")
if len(parts) < 2:
raise ValueError(
f"Invalid test name format. Expected format: prefix-config_name, got: {bracket_content}"
)
# parts[0] is the prefix, parts[1:] is the config name
if "disagg" not in parts[0]:
raise ValueError(
f"Invalid test name format. Expected format: disagg-config_name, got: {bracket_content}"
)
config_base_name = "-".join(parts[1:])
config_yaml_path = os.path.join(
llm_src,
"tests",
"integration",
"defs",
"perf",
"disagg",
"test_configs",
"disagg",
"perf",
f"{config_base_name}.yaml",
)
if not os.path.exists(config_yaml_path):
raise FileNotFoundError(f"Config file not found: {config_yaml_path}")
return config_yaml_path
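For example, a hypothetical test-list entry resolves to a config under tests/integration/defs/perf/disagg/test_configs/disagg/perf as follows:

first_line = "perf/test_perf.py::test_perf[disagg-deepseek_r1_0528_fp4_v2_1k1k]"   # hypothetical
bracket = first_line.split("[")[-1].split("]")[0]    # disagg-deepseek_r1_0528_fp4_v2_1k1k
config_base = "-".join(bracket.split("-")[1:])       # deepseek_r1_0528_fp4_v2_1k1k
# -> <llm_src>/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek_r1_0528_fp4_v2_1k1k.yaml
print(config_base)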
def main():
parser = argparse.ArgumentParser(
description="Generate SLURM launch script for both CI and local modes"
)
parser.add_argument(
"--run-ci",
action="store_true",
default=False,
help="Run in CI mode (true) or local mode (false)",
)
parser.add_argument("--draft-launch-sh", required=True, help="Path to draft-launch.sh script")
parser.add_argument("--launch-sh", required=True, help="Path to output launch.sh script")
parser.add_argument("--run-sh", required=True, help="Path to slurm_run.sh script")
parser.add_argument("--install-sh", required=True, help="Path to slurm_install.sh script")
# Optional arguments for local mode
parser.add_argument("--config-yaml", default="", help="Path to config YAML file")
parser.add_argument("--stage-name", default="", help="Stage name (optional, local mode only)")
# Optional arguments for CI mode
parser.add_argument("--llm-src", default="", help="Path to LLM source code")
parser.add_argument("--test-list", default="", help="Path to test list file")
parser.add_argument(
"--script-prefix",
default="",
help="Launch script prefix file path (optional, CI mode only)",
)
parser.add_argument(
"--srun-args",
default="",
help="Path to file containing srun args (optional, CI mode only)",
)
args = parser.parse_args()
config_yaml = get_config_yaml(args.test_list, args.llm_src)
with open(config_yaml, "r") as f:
config = yaml.safe_load(f)
# Determine install script path
install_script = args.install_sh
env_config = get_env_config(config)
print(f"Environment configuration: {env_config}")
benchmark_config = get_benchmark_config(config)
print(f"Benchmark configuration: {benchmark_config}")
benchmark_mode = benchmark_config["mode"]
hardware_config = get_hardware_config(config, benchmark_mode)
print(f"Hardware configuration: {hardware_config}")
script_prefix_lines = []
srun_args_lines = []
with open(args.script_prefix, "r") as f:
script_prefix_content = f.read()
script_prefix_lines = script_prefix_content.split("\n")
with open(args.srun_args, "r") as f:
srun_args_content = f.read()
srun_args_lines = srun_args_content.split()
# Extract pytestCommand and generate pytestCommandNoLLMAPILaunch
pytest_command_no_llmapi_launch = get_pytest_command_no_llmapilaunch(script_prefix_lines)
# Build worker env vars, add extra env vars for gen_only mode
worker_env_vars = env_config["worker_env_var"]
server_env_vars = env_config["server_env_var"]
if "gen_only" in benchmark_config["mode"]:
concurrency = benchmark_config["concurrency"]
worker_env_vars = (
"TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 "
f"TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1 "
f"TLLM_BENCHMARK_REQ_QUEUES_SIZE={concurrency} {worker_env_vars}"
)
server_env_vars = f"TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 {server_env_vars}"
script_prefix_lines.append("export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1")
srun_args_lines.append("--container-env=TRTLLM_DISAGG_BENCHMARK_GEN_ONLY")
script_prefix_lines.extend(
[
pytest_command_no_llmapi_launch,
f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $pytestCommand"',
f'export pytestCommandDisaggServer="{server_env_vars} $pytestCommandNoLLMAPILaunch"',
f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $pytestCommandNoLLMAPILaunch"',
f"export runScript={args.run_sh}",
f"export installScript={install_script}",
f"export numCtxServers={hardware_config['num_ctx_servers']}",
f"export numGenServers={hardware_config['num_gen_servers']}",
f"export gpusPerNode={hardware_config['gpus_per_node']}",
f"export gpusPerCtxServer={hardware_config['gpus_per_ctx_server']}",
f"export gpusPerGenServer={hardware_config['gpus_per_gen_server']}",
f"export nodesPerCtxServer={hardware_config['nodes_per_ctx_server']}",
f"export nodesPerGenServer={hardware_config['nodes_per_gen_server']}",
f"export totalNodes={hardware_config['total_nodes']}",
f"export totalGpus={hardware_config['total_gpus']}",
]
)
script_prefix_lines = remove_whitespace_lines(script_prefix_lines)
script_prefix = "\n".join(script_prefix_lines)
srun_args_lines = remove_whitespace_lines(srun_args_lines)
srun_args_lines.extend(
[
"--container-env=DISAGG_SERVING_TYPE",
"--container-env=pytestCommand",
]
)
srun_args_lines = ["srunArgs=("] + [f' "{line}"' for line in srun_args_lines] + [")"]
srun_args = "\n".join(srun_args_lines)
with open(args.draft_launch_sh, "r") as f:
draft_launch_content = f.read()
draft_launch_lines = draft_launch_content.split("\n")
draft_launch_lines = remove_whitespace_lines(draft_launch_lines)
draft_launch_content = "\n".join(draft_launch_lines)
with open(args.launch_sh, "w") as f:
f.write(f"{script_prefix}\n{srun_args}\n{draft_launch_content}")
print(f"Launch script generated at: {args.launch_sh}")
print(f"Launch script:\n{script_prefix}\n{srun_args}\n{draft_launch_content}")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,37 @@
#!/bin/bash
# Set up error handling
set -Eeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
if [ $SLURM_LOCALID -eq 0 ]; then
wget -nv $llmTarfile
tar -zxf $tarName
which python3
python3 --version
apt-get install -y libffi-dev
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
if [[ $pytestCommand == *--run-ray* ]]; then
pip3 install --retries 10 ray[default]
fi
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
touch install_lock.lock
else
while [ ! -f install_lock.lock ]; do
sleep 5
done
fi
}
# Only run slurm_install_setup when script is executed directly (not sourced)
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
slurm_install_setup
fi
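The same rank-0-installs, everyone-else-waits barrier, sketched in Python for clarity (the lock file name mirrors the script; the install step itself is elided):

import os
import time

def install_barrier(local_rank, lock_path="install_lock.lock"):
    # Local rank 0 performs the one-time install and then drops a lock file;
    # the remaining local ranks poll until it appears, as in slurm_install.sh.
    if local_rank == 0:
        # ... run the pip installs here ...
        open(lock_path, "w").close()
    else:
        while not os.path.exists(lock_path):
            time.sleep(5)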

View File

@@ -39,26 +39,12 @@ if [ $SLURM_PROCID -eq 0 ]; then
    fi
fi
-if [ $SLURM_LOCALID -eq 0 ]; then
-    wget -nv $llmTarfile
-    tar -zxf $tarName
-    which python3
-    python3 --version
-    apt-get install -y libffi-dev
-    nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
-    if [[ $pytestCommand == *--run-ray* ]]; then
-        pip3 install --retries 10 ray[default]
-    fi
-    cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
-    cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
-    gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
-    hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
-    echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
-    touch install_lock.lock
-else
-    while [ ! -f install_lock.lock ]; do
-        sleep 5
-    done
+# Aggregated mode will run install together with pytest in slurm_run.sh
+# Disaggregated mode will run install separately in slurm_install.sh
+if [[ "$stageName" != *Disagg* ]]; then
+    installScriptPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_run\.sh/slurm_install.sh/')"
+    source "$installScriptPath"
+    slurm_install_setup
fi
if [[ "$stageName" == *GB200* ]]; then
@@ -131,3 +117,9 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
        --files $stageName/perf_script_test_results.csv \
        $basePerfPath
fi
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
echo "Check Perf-Sanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
fi

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@ -0,0 +1,105 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: -1
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: "--gres=gpu:4"
numa_bind: true
benchmark:
mode: e2e
use_nv_sa_benchmark: true
multi_round: 8
benchmark_ratio: 0.8
streaming: true
concurrency_list: '6144'
input_length: 1024
output_length: 1024
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 8
moe_expert_parallel_size: 8
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
pipeline_parallel_size: 1
context_parallel_size: 1
max_batch_size: 768
max_num_tokens: 768
max_seq_len: 2068
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
- 768
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
dtype: fp8
moe_config:
backend: CUTLASS
use_low_precision_moe_combine: true
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
stream_interval: 100
num_postprocess_workers: 4
ctx:
max_batch_size: 16
max_num_tokens: 16896
max_seq_len: 2044
tensor_parallel_size: 4
context_parallel_size: 1
moe_expert_parallel_size: 4
enable_attention_dp: true
pipeline_parallel_size: 1
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@ -0,0 +1,122 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
precision: fp4
model_dir_name: DeepSeek-R1-0528-FP4-v2
supported_gpus:
- GB200
- GB300
script_file: disaggr_torch.slurm
benchmark_type: 1k1k
config_index: -1
slurm:
script_file: disaggr_torch.slurm
partition: <partition>
account: <account>
job_time: 02:00:00
job_name: unified-benchmark
extra_args: "--gres=gpu:4"
numa_bind: true
benchmark:
mode: gen_only
use_nv_sa_benchmark: true
multi_round: 1
benchmark_ratio: 0.8
streaming: true
concurrency_list: '1024'
input_length: 8192
output_length: 1024
dataset_file: <dataset_file>
hardware:
gpus_per_node: 4
num_ctx_servers: 1
num_gen_servers: 1
environment:
container_mount: <container_mount>
container_image: <container_image>
model_path: <model_path>
trtllm_repo: ''
build_wheel: false
work_dir: <full_path_to_work_dir>
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
tensor_parallel_size: 32
moe_expert_parallel_size: 32
context_parallel_size: 1
enable_attention_dp: true
enable_lm_head_tp_in_adp: true
pipeline_parallel_size: 1
max_batch_size: 128
max_num_tokens: 512
max_seq_len: 9256
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 24
- 32
- 40
- 48
- 56
- 64
- 72
- 80
- 88
- 96
- 104
- 112
- 120
- 128
print_iter_log: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
stream_interval: 100
num_postprocess_workers: 4
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
max_batch_size: 2
max_num_tokens: 16896
max_seq_len: 9256
tensor_parallel_size: 4
context_parallel_size: 1
moe_expert_parallel_size: 4
enable_attention_dp: true
pipeline_parallel_size: 1
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
cache_transceiver_config:
max_tokens_in_buffer: 16384
backend: UCX
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 3

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:

View File

@@ -33,6 +33,8 @@ from jenkins.scripts.open_search_db import OpenSearchDB
PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1"  # "sandbox-trtllm-ci-perf"
TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info"
+PRE_MERGE_THRESHOLD = 0.1
+POST_MERGE_THRESHOLD = 0.05
# Metrics where larger is better
MAXIMIZE_METRICS = [
@@ -268,24 +270,7 @@ def match(history_data, new_data, match_keys):
    def is_empty(value):
        return value is None or value == ""
-    def should_skip_field(field):
-        # Skip fields starting with @, _, ts_
-        if field.startswith('@') or field.startswith('_') or field.startswith(
-                'ts_'):
-            return True
-        # Skip log links and speculative_model_dir and job configs
-        if field in [
-                's_speculative_model_dir', 's_server_log_link',
-                's_ctx_server_log_link', 's_gen_server_log_link',
-                's_client_log_link'
-        ]:
-            return True
-        return False
    for field in match_keys:
-        # Skip excluded fields
-        if should_skip_field(field):
-            continue
        history_value = history_data.get(field, None)
        new_value = new_data.get(field, None)
        if is_empty(history_value) and is_empty(new_value):
@@ -412,6 +397,33 @@ def get_history_data(new_data_dict, gpu_type, match_keys):
    return history_baseline_dict, history_data_dict
def get_threshold(baseline_data, metric):
"""
Get the threshold for a metric from baseline data.
"""
is_post_merge = baseline_data.get("b_is_post_merge", False)
metric_suffix = metric[2:] # Remove "d_" prefix
if is_post_merge:
threshold_key = f"d_threshold_post_merge_{metric_suffix}"
else:
threshold_key = f"d_threshold_pre_merge_{metric_suffix}"
# Try to get the specific threshold (post_merge or pre_merge)
if threshold_key in baseline_data:
return baseline_data[threshold_key]
# Fall back to general threshold
fallback_key = f"d_threshold_{metric_suffix}"
if fallback_key in baseline_data:
return baseline_data[fallback_key]
# No threshold found, raise error
raise KeyError(
f"No threshold found for metric '{metric}'. "
f"Expected '{threshold_key}' or '{fallback_key}' in baseline data.")
def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
    """Get regressive test cases
    1. For Maximize metrics, if new perf is below baseline * (1 - threshold)
@@ -419,8 +431,9 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
    Set it as regressive.
    """
    regressive_data_list = []
+    cmd_idxs = new_data_dict.keys()
    # Find regressive test cases
-    for cmd_idx in new_data_dict:
+    for cmd_idx in cmd_idxs:
        if history_baseline_dict[cmd_idx] is None:
            continue
@@ -433,8 +446,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
        for metric in MAXIMIZE_METRICS:
            if metric not in new_data or metric not in baseline_data:
                continue
-            threshold_key = f"d_threshold_{metric[2:]}"
-            threshold = baseline_data[threshold_key]
+            threshold = get_threshold(baseline_data, metric)
            baseline_value = baseline_data[metric]
            new_value = new_data[metric]
            # Regressive if new_value < baseline_value * (1 - threshold)
@@ -446,8 +458,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
        for metric in MINIMIZE_METRICS:
            if metric not in new_data or metric not in baseline_data:
                continue
-            threshold_key = f"d_threshold_{metric[2:]}"
-            threshold = baseline_data.get(threshold_key, 0.1)
+            threshold = get_threshold(baseline_data, metric)
            baseline_value = baseline_data[metric]
            new_value = new_data[metric]
            # Regressive if new_value > baseline_value * (1 + threshold)
@@ -464,10 +475,16 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
            baseline_key = f"d_baseline_{metric[2:]}"
            regressive_data[baseline_key] = baseline_data[metric]
-            threshold_key = f"d_threshold_{metric[2:]}"
-            if threshold_key in baseline_data:
-                regressive_data[threshold_key] = baseline_data[
-                    threshold_key]
+            # Copy all threshold keys from baseline
+            metric_suffix = metric[2:]
+            for threshold_key in [
+                    f"d_threshold_{metric_suffix}",
+                    f"d_threshold_post_merge_{metric_suffix}",
+                    f"d_threshold_pre_merge_{metric_suffix}"
+            ]:
+                if threshold_key in baseline_data:
+                    regressive_data[threshold_key] = baseline_data[
+                        threshold_key]
        # Add regression info string
        regressive_data["s_regression_info"] = ", ".join(regressive_metrics)
@@ -478,8 +495,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
    return regressive_data_list
-def prepare_baseline_data(history_baseline_dict, history_data_dict,
-                          new_data_dict):
+def prepare_baseline_data(history_data_dict, new_data_dict):
    """
    Calculate new baseline from history post-merge data and new data.
    Then return new baseline data.
@@ -491,20 +507,19 @@ def prepare_baseline_data(history_baseline_dict, history_data_dict,
        # Calculate best metrics from history post-merge data and new data
        best_metrics = calculate_best_perf_result(history_data_dict[cmd_idx],
                                                  new_data_dict[cmd_idx])
-        new_baseline_data = history_baseline_dict[cmd_idx]
-        if new_baseline_data:
-            print_info(f"Baseline data found (cmd_idx: {cmd_idx}) in history")
-        else:
-            print_info(
-                f"No baseline data found (cmd_idx: {cmd_idx}), created a new baseline"
-            )
-            new_baseline_data = new_data_dict[cmd_idx].copy()
-            new_baseline_data["b_is_baseline"] = True
-            add_id(new_baseline_data)
-        # Add or update baseline metrics
+        new_baseline_data = new_data_dict[cmd_idx].copy()
+        new_baseline_data["b_is_baseline"] = True
+        # Add or update baseline metrics and thresholds
        for metric, value in best_metrics.items():
            new_baseline_data[metric] = value
-            new_baseline_data[f"d_threshold_{metric[2:]}"] = 0.1
+            metric_suffix = metric[2:]
+            post_merge_key = f"d_threshold_post_merge_{metric_suffix}"
+            pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}"
+            new_baseline_data[post_merge_key] = new_baseline_data.get(
+                post_merge_key, POST_MERGE_THRESHOLD)
+            new_baseline_data[pre_merge_key] = new_baseline_data.get(
+                pre_merge_key, PRE_MERGE_THRESHOLD)
+        add_id(new_baseline_data)
        new_baseline_data_dict[cmd_idx] = new_baseline_data
    return new_baseline_data_dict
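A small sketch of the resulting baseline record for one hypothetical cmd_idx, showing that the best metric value is stored and the pre/post-merge thresholds are only defaulted when not already present:

new_data = {"d_token_throughput": 105.0}           # hypothetical new run
best_metrics = {"d_token_throughput": 110.0}       # best of history + new
baseline = dict(new_data, b_is_baseline=True)
for metric, value in best_metrics.items():
    baseline[metric] = value
    suffix = metric[2:]
    baseline.setdefault(f"d_threshold_post_merge_{suffix}", 0.05)   # POST_MERGE_THRESHOLD
    baseline.setdefault(f"d_threshold_pre_merge_{suffix}", 0.1)     # PRE_MERGE_THRESHOLD
assert baseline["d_token_throughput"] == 110.0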

View File

@ -0,0 +1,185 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import yaml
METRICS = [
"seq_throughput",
"token_throughput",
"total_token_throughput",
"user_throughput",
"mean_tpot",
"median_tpot",
"p99_tpot",
"mean_ttft",
"median_ttft",
"p99_ttft",
"mean_itl",
"median_itl",
"p99_itl",
"mean_e2el",
"median_e2el",
"p99_e2el",
]
def should_skip_execution():
disagg_type = os.getenv("DISAGG_SERVING_TYPE", "")
if (
disagg_type.startswith("GEN")
or disagg_type.startswith("CTX")
or disagg_type == "DISAGG_SERVER"
):
return True
return False
def find_yaml_files(job_workspace, filename):
yaml_files = []
for root, dirs, files in os.walk(job_workspace):
for file in files:
if file == filename:
yaml_files.append(os.path.join(root, file))
return yaml_files
def read_yaml_data(yaml_files):
all_data = []
for file_path in yaml_files:
try:
with open(file_path, "r") as f:
data = yaml.safe_load(f)
if data:
if isinstance(data, list):
all_data.extend(data)
else:
all_data.append(data)
except Exception as e:
print(f"Error reading {file_path}: {e}")
return all_data
def get_metric_keys():
metric_keys = set()
for metric in METRICS:
metric_keys.add(f"d_{metric}")
metric_keys.add(f"d_baseline_{metric}")
metric_keys.add(f"d_threshold_{metric}")
return metric_keys
def print_perf_data(data):
print("=== Metrics ===")
for metric in METRICS:
value_key = f"d_{metric}"
if value_key in data:
value = data.get(value_key, "N/A")
print(f'"{value_key}": {value}')
metric_keys = get_metric_keys()
print("\n=== Config ===")
config_keys = sorted([key for key in data.keys() if key not in metric_keys])
for key in config_keys:
value = data[key]
print(f'"{key}": {value}')
def print_regression_data(data):
if "s_regression_info" in data:
print("=== Regression Info ===")
print(f"{data['s_regression_info']}")
metric_keys = get_metric_keys()
print("=== Metrics ===")
for metric in METRICS:
value_key = f"d_{metric}"
baseline_key = f"d_baseline_{metric}"
threshold_key = f"d_threshold_{metric}"
# Only print if at least one of the keys exists
if value_key in data or baseline_key in data or threshold_key in data:
value = data.get(value_key, "N/A")
baseline = data.get(baseline_key, "N/A")
threshold = data.get(threshold_key, "N/A")
# Calculate percentage difference between value and baseline
if (
isinstance(value, (int, float))
and isinstance(baseline, (int, float))
and baseline != 0
):
percentage = (value - baseline) / baseline * 100
percentage_str = f"{percentage:+.2f}%"
else:
percentage_str = "N/A"
print(
f'"{value_key}": {value}, "{baseline_key}": {baseline}, '
f'"{threshold_key}": {threshold}, "diff": {percentage_str}'
)
print("\n=== Config ===")
config_keys = sorted([key for key in data.keys() if key not in metric_keys])
for key in config_keys:
if key == "s_regression_info":
continue
value = data[key]
print(f'"{key}": {value}')
def main():
if should_skip_execution():
print("Skipping check_perf_regression.py due to DISAGG_SERVING_TYPE")
return 0
job_workspace = sys.argv[1]
if not os.path.isdir(job_workspace):
print(f"Error: {job_workspace} is not a valid directory")
sys.exit(1)
perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml")
all_perf_data = read_yaml_data(perf_data_files)
print(f"Found {len(all_perf_data)} perf data")
for i, data in enumerate(all_perf_data):
print(f"\n{'=' * 60}")
print(f"Perf Data #{i + 1}")
print("=" * 60)
print_perf_data(data)
print(f"\n{'=' * 60}\n")
regression_files = find_yaml_files(job_workspace, "regression.yaml")
all_regression_data = read_yaml_data(regression_files)
print(f"Found {len(all_regression_data)} regression data")
for i, data in enumerate(all_regression_data):
print(f"\n{'=' * 60}")
print(f"Regression Data #{i + 1}")
print("=" * 60)
print_regression_data(data)
if len(all_regression_data) == 0:
print("\n No regression data found. Perf check is successful.")
return 0
else:
print(
f"\n Warning: Found {len(all_regression_data)} regression data. Perf check is failed."
)
return 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -19,14 +19,15 @@ import os
import re
import shutil
import socket
+import subprocess
import sys
from typing import Dict, List, NamedTuple
import pytest
import yaml
from defs.common import get_cpp_benchmark
-from defs.trt_test_alternative import (is_linux, is_windows, print_info,
-                                       print_warning)
+from defs.trt_test_alternative import (is_linux, is_windows, print_error,
+                                       print_info, print_warning)
from ..conftest import get_llm_root, llm_models_root, trt_environment
from .open_search_db_utils import (SCENARIO_MATCH_FIELDS, add_id,
@@ -227,6 +228,11 @@ def get_model_dir(model_name: str):
    return model_dir
+def get_dataset_path():
+    return os.path.join(llm_models_root(), "datasets",
+                        "ShareGPT_V3_unfiltered_cleaned_split.json")
def cpu_socket_count_gt_1():
    global MAP_BY_SOCKET
    if MAP_BY_SOCKET is not None:
@@ -319,37 +325,37 @@ BENCH_PERF_METRIC_LOG_QUERIES = {
AGGR_SERVER_PERF_METRIC_LOG_QUERIES = {
    PerfMetricType.SEQ_THROUGHPUT:
-    re.compile(r"Request throughput \(req\/s\):\s+([\d\.]+)"),
+    re.compile(r"Request throughput \(req\/s\):\s+(-?[\d\.]+)"),
    PerfMetricType.TOKEN_THROUGHPUT:
-    re.compile(r"Output token throughput \(tok\/s\):\s+([\d\.]+)"),
+    re.compile(r"Output token throughput \(tok\/s\):\s+(-?[\d\.]+)"),
    PerfMetricType.TOTAL_TOKEN_THROUGHPUT:
-    re.compile(r"Total Token throughput \(tok\/s\):\s+([\d\.]+)"),
+    re.compile(r"Total Token throughput \(tok\/s\):\s+(-?[\d\.]+)"),
    PerfMetricType.USER_THROUGHPUT:
-    re.compile(r"User throughput \(tok\/s\):\s+([\d\.]+)"),
+    re.compile(r"User throughput \(tok\/s\):\s+(-?[\d\.]+)"),
    PerfMetricType.FIRST_TOKEN_TIME:
-    re.compile(r"Mean TTFT \(ms\):\s+([\d\.]+)"),
+    re.compile(r"Mean TTFT \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.MEDIAN_FIRST_TOKEN_TIME:
-    re.compile(r"Median TTFT \(ms\):\s+([\d\.]+)"),
+    re.compile(r"Median TTFT \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.P99_FIRST_TOKEN_TIME:
-    re.compile(r"P99 TTFT \(ms\):\s+([\d\.]+)"),
+    re.compile(r"P99 TTFT \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.INTER_TOKEN_TIME:
-    re.compile(r"Mean ITL \(ms\):\s+([\d\.]+)"),
+    re.compile(r"Mean ITL \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.MEDIAN_INTER_TOKEN_TIME:
-    re.compile(r"Median ITL \(ms\):\s+([\d\.]+)"),
+    re.compile(r"Median ITL \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.P99_INTER_TOKEN_TIME:
-    re.compile(r"P99 ITL \(ms\):\s+([\d\.]+)"),
+    re.compile(r"P99 ITL \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.OUTPUT_TOKEN_TIME:
-    re.compile(r"Mean TPOT \(ms\):\s+([\d\.]+)"),
+    re.compile(r"Mean TPOT \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME:
-    re.compile(r"Median TPOT \(ms\):\s+([\d\.]+)"),
+    re.compile(r"Median TPOT \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.P99_OUTPUT_TOKEN_TIME:
-    re.compile(r"P99 TPOT \(ms\):\s+([\d\.]+)"),
+    re.compile(r"P99 TPOT \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.INFERENCE_TIME:
-    re.compile(r"Mean E2EL \(ms\):\s+([\d\.]+)"),
+    re.compile(r"Mean E2EL \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.MEDIAN_INFERENCE_TIME:
-    re.compile(r"Median E2EL \(ms\):\s+([\d\.]+)"),
+    re.compile(r"Median E2EL \(ms\):\s+(-?[\d\.]+)"),
    PerfMetricType.P99_INFERENCE_TIME:
-    re.compile(r"P99 E2EL \(ms\):\s+([\d\.]+)"),
+    re.compile(r"P99 E2EL \(ms\):\s+(-?[\d\.]+)"),
}
# (Relative threshold, Absolute threshold) for all metric types
@@ -512,17 +518,21 @@ class ServerConfig:
    def __init__(self, server_config_data: dict, env_vars: str = ""):
        # Extract required fields
+        self.mode = server_config_data.get('mode', 'e2e')
+        self.concurrency = server_config_data.get('concurrency', 1)
        self.name = server_config_data['name']
        self.model_name = server_config_data['model_name']
-        self.gpus = server_config_data['gpus']
        self.model_path = ""
        self.env_vars = env_vars
        # Extract optional fields with defaults
-        self.tp = server_config_data.get('tensor_parallel_size', self.gpus)
+        self.tp = server_config_data.get('tensor_parallel_size', 1)
        self.ep = server_config_data.get('moe_expert_parallel_size', 1)
        self.pp = server_config_data.get('pipeline_parallel_size', 1)
-        self.gpus_per_node = server_config_data.get('gpus_per_node', self.gpus)
+        self.cp = server_config_data.get('context_parallel_size', 1)
+        self.gpus = server_config_data.get('gpus', self.tp * self.cp * self.pp)
+        self.gpus_per_node = server_config_data.get('gpus_per_node',
+                                                    0) or self.gpus
        self.max_num_tokens = server_config_data.get('max_num_tokens', 2048)
        self.max_batch_size = server_config_data.get('max_batch_size', 512)
        self.max_seq_len = server_config_data.get('max_seq_len', 0)
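A sketch of the new defaulting rules for gpus and gpus_per_node (plain dict arithmetic, not a real ServerConfig instance):

cfg = {"tensor_parallel_size": 8, "pipeline_parallel_size": 1}   # no explicit 'gpus' key
tp = cfg.get("tensor_parallel_size", 1)
pp = cfg.get("pipeline_parallel_size", 1)
cp = cfg.get("context_parallel_size", 1)
gpus = cfg.get("gpus", tp * cp * pp)                  # 8, derived from the parallelism sizes
gpus_per_node = cfg.get("gpus_per_node", 0) or gpus   # falls back to the server's GPU count
assert (gpus, gpus_per_node) == (8, 8)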
@@ -538,6 +548,8 @@ class ServerConfig:
            'enable_attention_dp', False)
        self.trust_remote_code = server_config_data.get('trust_remote_code',
                                                        False)
+        self.enable_lm_head_tp_in_adp = server_config_data.get(
+            'enable_lm_head_tp_in_adp', False)
        # attention_dp_config
        attention_dp_config = server_config_data.get('attention_dp_config', {})
@@ -551,6 +563,12 @@ class ServerConfig:
        moe_config = server_config_data.get('moe_config', {})
        self.moe_backend = moe_config.get('backend', "")
        self.moe_max_num_tokens = moe_config.get('max_num_tokens', 0)
+        self.use_low_precision_moe_combine = moe_config.get(
+            'use_low_precision_moe_combine', False)
+        load_balancer_config = moe_config.get('load_balancer', {})
+        self.load_balancer_num_slots = load_balancer_config.get('num_slots', 0)
+        self.load_balancer_layer_updates_per_iter = load_balancer_config.get(
+            'layer_updates_per_iter', 0)
        # cuda_graph_config
        cuda_graph_config = server_config_data.get('cuda_graph_config', {})
@@ -605,10 +623,13 @@ class ServerConfig:
        self.match_mode = server_config_data.get('match_mode', "config")
        # Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs)
+        exclude_keys = [
+            'mode', 'concurrency', 'name', 'model_name', 'gpus',
+            'gpus_per_node', 'client_configs'
+        ]
        self.extra_llm_api_config_data = {
            k: v
-            for k, v in server_config_data.items()
-            if k not in ['name', 'model_name', 'gpus', 'client_configs']
+            for k, v in server_config_data.items() if k not in exclude_keys
        }
def to_cmd(self, def to_cmd(self,
@ -634,8 +655,41 @@ class ServerConfig:
def to_env(self) -> Dict[str, str]: def to_env(self) -> Dict[str, str]:
return to_env_dict(self.env_vars) return to_env_dict(self.env_vars)
def to_match_keys(self) -> List[str]:
return [
"s_mode",
"s_model_name",
"l_tp",
"l_ep",
"l_pp",
"l_cp",
"l_gpus_per_node",
"l_max_batch_size",
"b_disable_overlap_scheduler",
"l_num_postprocess_workers",
"s_attn_backend",
"b_enable_chunked_prefill",
"b_enable_attention_dp",
"b_enable_lm_head_tp_in_adp",
# attention_dp_config
"b_attention_dp_balance",
# moe_config
"s_moe_backend",
# cuda_graph_config
"b_enable_cuda_graph",
# kv_cache_config
"s_kv_cache_dtype",
# cache_transceiver_config
"s_cache_transceiver_backend"
# speculative_config
"s_spec_decoding_type",
"l_num_nextn_predict_layers",
]
    def to_db_data(self) -> dict:
        db_data = {
+            "s_mode":
+            self.mode,
            "s_model_name":
            self.model_name.lower(),
            "l_gpus":
@ -646,6 +700,8 @@ class ServerConfig:
self.ep, self.ep,
"l_pp": "l_pp":
self.pp, self.pp,
"l_cp":
self.cp,
"l_gpus_per_node": "l_gpus_per_node":
self.gpus_per_node, self.gpus_per_node,
"l_max_num_tokens": "l_max_num_tokens":
@ -668,6 +724,8 @@ class ServerConfig:
self.enable_attention_dp, self.enable_attention_dp,
"b_trust_remote_code": "b_trust_remote_code":
self.trust_remote_code, self.trust_remote_code,
"b_enable_lm_head_tp_in_adp":
self.enable_lm_head_tp_in_adp,
# attention_dp_config # attention_dp_config
"b_attention_dp_balance": "b_attention_dp_balance":
self.attention_dp_balance, self.attention_dp_balance,
@ -680,6 +738,12 @@ class ServerConfig:
self.moe_backend, self.moe_backend,
"l_moe_max_num_tokens": "l_moe_max_num_tokens":
self.moe_max_num_tokens, self.moe_max_num_tokens,
"b_use_low_precision_moe_combine":
self.use_low_precision_moe_combine,
"l_load_balancer_num_slots":
self.load_balancer_num_slots,
"l_load_balancer_layer_updates_per_iter":
self.load_balancer_layer_updates_per_iter,
# cuda_graph_config # cuda_graph_config
"b_enable_cuda_graph": "b_enable_cuda_graph":
self.enable_cuda_graph, self.enable_cuda_graph,
@ -754,7 +818,7 @@ class ClientConfig:
self.osl = client_config_data.get('osl', 1024) self.osl = client_config_data.get('osl', 1024)
self.random_range_ratio = client_config_data.get( self.random_range_ratio = client_config_data.get(
'random_range_ratio', 0.0) 'random_range_ratio', 0.0)
self.backend = client_config_data.get('backend', "") self.backend = client_config_data.get('backend', "openai")
self.use_chat_template = client_config_data.get('use_chat_template', self.use_chat_template = client_config_data.get('use_chat_template',
False) False)
self.streaming = client_config_data.get('streaming', True) self.streaming = client_config_data.get('streaming', True)
@ -765,18 +829,36 @@ class ClientConfig:
model_dir = get_model_dir(self.model_name) model_dir = get_model_dir(self.model_name)
self.model_path = model_dir if os.path.exists( self.model_path = model_dir if os.path.exists(
model_dir) else self.model_name model_dir) else self.model_name
dataset_path = get_dataset_path()
benchmark_cmd = [ benchmark_cmd = [
"python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving", "python",
"--model", self.model_path, "--dataset-name", "random", "-m",
"--random-ids", "--num-prompts", "tensorrt_llm.serve.scripts.benchmark_serving",
str(self.concurrency * self.iterations), "--random-input-len", "--model",
str(self.isl), "--random-output-len", self.model_path,
str(self.osl), "--random-range-ratio", "--tokenizer",
str(self.random_range_ratio), "--ignore-eos", self.model_path,
"--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency", "--dataset-name",
str(self.concurrency) "random",
"--random-ids",
"--num-prompts",
str(self.concurrency * self.iterations),
"--max-concurrency",
str(self.concurrency),
"--random-input-len",
str(self.isl),
"--random-output-len",
str(self.osl),
"--random-range-ratio",
str(self.random_range_ratio),
"--trust-remote-code",
"--ignore-eos",
"--percentile-metrics",
"ttft,tpot,itl,e2el",
] ]
if dataset_path and os.path.exists(dataset_path):
benchmark_cmd.append("--dataset-path")
benchmark_cmd.append(dataset_path)
if self.backend: if self.backend:
benchmark_cmd.append("--backend") benchmark_cmd.append("--backend")
benchmark_cmd.append(self.backend) benchmark_cmd.append(self.backend)
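Putting the pieces above together, the assembled command looks roughly like the following sketch, using hypothetical values (concurrency 4096, 10 iterations, 1k/1k sequence lengths, a local model path); the real values come from the client config.

```python
# Illustrative result of ClientConfig.to_cmd() for a hypothetical client config.
benchmark_cmd = [
    "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
    "--model", "/models/deepseek_r1_0528_fp8",        # assumed local model path
    "--tokenizer", "/models/deepseek_r1_0528_fp8",
    "--dataset-name", "random", "--random-ids",
    "--num-prompts", str(4096 * 10),                  # concurrency * iterations
    "--max-concurrency", "4096",
    "--random-input-len", "1024", "--random-output-len", "1024",
    "--random-range-ratio", "0.2",
    "--trust-remote-code", "--ignore-eos",
    "--percentile-metrics", "ttft,tpot,itl,e2el",
    "--backend", "openai",                            # backend defaults to "openai"
]
```

If a dataset path is configured and exists on disk, `--dataset-path <path>` is appended as well.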
@ -789,6 +871,18 @@ class ClientConfig:
def to_env(self) -> Dict[str, str]: def to_env(self) -> Dict[str, str]:
return to_env_dict(self.env_vars) return to_env_dict(self.env_vars)
def to_match_keys(self) -> List[str]:
return [
"l_concurrency",
"l_iterations",
"l_isl",
"l_osl",
"d_random_range_ratio",
"s_backend",
"b_use_chat_template",
"b_streaming",
]
def to_db_data(self) -> dict: def to_db_data(self) -> dict:
"""Convert ClientConfig to Database data""" """Convert ClientConfig to Database data"""
db_data = { db_data = {
@ -867,36 +961,37 @@ def parse_aggr_config_file(config_file_path: str, select_pattern: str = None):
else: else:
execution_plan = None execution_plan = None
# Read YAML config file
with open(config_file_path, 'r') as f: with open(config_file_path, 'r') as f:
config = yaml.safe_load(f) config = yaml.safe_load(f)
# Read environment config metadata = config.get('metadata', {})
environment = config.get('environment', {}) environment = config.get('environment', {})
if not environment: hardware = config.get('hardware', {})
environment = {} gpus_per_node = hardware.get('gpus_per_node', 0)
# Get environment variables model_name = metadata.get('model_name', '')
environment.get('worker_env_var', '')
server_env_var = environment.get('server_env_var', '') server_env_var = environment.get('server_env_var', '')
client_env_var = environment.get('client_env_var', '') client_env_var = environment.get('client_env_var', '')
server_configs = [] server_configs = []
server_client_configs = {} server_client_configs = {}
for server_config_data in config['server_configs']: for server_config_data in config['server_configs']:
server_name = server_config_data['name'] server_name = server_config_data['name']
server_config_data[
'model_name'] = model_name if 'model_name' not in server_config_data else server_config_data[
'model_name']
server_config_data['mode'] = 'e2e'
server_config_data['concurrency'] = -1
server_config_data['gpus_per_node'] = gpus_per_node
# Check if this server should be included based on execution_plan # Check if this server should be included based on execution_plan
if execution_plan is not None and server_name not in execution_plan: if execution_plan is not None and server_name not in execution_plan:
continue continue
# Create ServerConfig object directly from dict
server_config = ServerConfig(server_config_data, server_env_var) server_config = ServerConfig(server_config_data, server_env_var)
server_id = len(server_configs) server_id = len(server_configs)
server_configs.append(server_config) server_configs.append(server_config)
# Create ClientConfig objects
client_configs = [] client_configs = []
selected_client_names = execution_plan.get( selected_client_names = execution_plan.get(
server_name) if execution_plan else None server_name) if execution_plan else None
@ -905,7 +1000,6 @@ def parse_aggr_config_file(config_file_path: str, select_pattern: str = None):
client_name = client_config_data['name'] client_name = client_config_data['name']
# Check if this client should be included # Check if this client should be included
# Include if: execution_plan is None OR selected_client_names is None OR client_name in selected_client_names
if execution_plan is not None and selected_client_names is not None: if execution_plan is not None and selected_client_names is not None:
if client_name not in selected_client_names: if client_name not in selected_client_names:
continue continue
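A minimal sketch of the execution-plan semantics implied by the two checks above; the concrete `select_pattern` syntax is parsed earlier and is not shown here, so this structure is an assumption.

```python
# Hypothetical execution plan: keys are server names; None means "run every
# client config for that server", a list restricts the run to those clients.
execution_plan = {
    "r1_fp8_dep8_mtp1_1k1k": None,
    "r1_fp8_tep8_mtp3_1k1k": ["con64_iter10_1k1k"],
}
```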
@ -929,46 +1023,48 @@ def parse_multi_node_disagg_config_file(config_file_path: str,
config = yaml.safe_load(f) config = yaml.safe_load(f)
disagg_configs = [] disagg_configs = []
metadata = config.get('metadata', {})
hardware = config.get('hardware', {}) hardware = config.get('hardware', {})
benchmark = config.get('benchmark', {}) benchmark = config.get('benchmark', {})
environment = config.get('environment', {}) environment = config.get('environment', {})
slurm_config = config.get('slurm', {}) slurm_config = config.get('slurm', {})
worker_config = config.get('worker_config', {}) worker_config = config.get('worker_config', {})
timeout = slurm_config.get('timeout', 3600) timeout = slurm_config.get('timeout', 7200)
numa_bind = slurm_config.get('numa_bind', False) numa_bind = slurm_config.get('numa_bind', False)
gpus_per_node = hardware.get('gpus_per_node', 0)
model_name = metadata.get('model_name', '')
assert model_name, "model_name is required in metadata section"
# Get model name from environment benchmark_mode = benchmark.get('mode', 'e2e')
model_name = environment.get('model_name', '') if "gen_only" in benchmark_mode:
assert model_name, "model_name is required in environment section" hardware['num_ctx_servers'] = 0
# Get environment variables
worker_env_var = environment.get('worker_env_var', '') worker_env_var = environment.get('worker_env_var', '')
server_env_var = environment.get('server_env_var', '') server_env_var = environment.get('server_env_var', '')
client_env_var = environment.get('client_env_var', '') client_env_var = environment.get('client_env_var', '')
# Create ctx_server config data concurrency_str = benchmark.get('concurrency_list', '1')
if isinstance(concurrency_str, str):
concurrency = max(int(x) for x in concurrency_str.split())
else:
concurrency = int(concurrency_str)
ctx_server_config_data = { ctx_server_config_data = {
'mode': benchmark_mode,
'concurrency': concurrency,
'name': 'ctx', 'name': 'ctx',
'model_name': model_name, 'model_name': model_name,
'gpus': hardware.get('gpus_per_ctx_server'), 'gpus_per_node': gpus_per_node,
'gpus_per_node': hardware.get('gpus_per_node'),
**worker_config.get('ctx', {}) **worker_config.get('ctx', {})
} }
# Create gen_server config data
gen_server_config_data = { gen_server_config_data = {
'mode': benchmark_mode,
'concurrency': concurrency,
'name': 'gen', 'name': 'gen',
'model_name': model_name, 'model_name': model_name,
'gpus': hardware.get('gpus_per_gen_server'), 'gpus_per_node': gpus_per_node,
'gpus_per_node': hardware.get('gpus_per_node'),
**worker_config.get('gen', {}) **worker_config.get('gen', {})
} }
# Create client config data
concurrency_str = benchmark.get('concurrency_list', '1')
concurrency = int(concurrency_str) if isinstance(concurrency_str,
str) else concurrency_str
client_config_data = { client_config_data = {
'name': 'client', 'name': 'client',
'concurrency': concurrency, 'concurrency': concurrency,
@ -980,13 +1076,12 @@ def parse_multi_node_disagg_config_file(config_file_path: str,
'use_chat_template': False, 'use_chat_template': False,
'streaming': benchmark.get('streaming', True), 'streaming': benchmark.get('streaming', True),
} }
# Create disagg_config dict
disagg_config = { disagg_config = {
'disagg_serving_type': disagg_serving_type, 'disagg_serving_type': disagg_serving_type,
'hostname': socket.gethostname(), 'hostname': socket.gethostname(),
'numa_bind': numa_bind, 'numa_bind': numa_bind,
'timeout': timeout, 'timeout': timeout,
'mode': benchmark_mode,
'name': 'disagg_config', 'name': 'disagg_config',
'model_name': model_name, 'model_name': model_name,
'hardware': hardware, 'hardware': hardware,
@ -995,9 +1090,7 @@ def parse_multi_node_disagg_config_file(config_file_path: str,
'server_env_var': server_env_var, 'server_env_var': server_env_var,
'client': ClientConfig(client_config_data, model_name, client_env_var), 'client': ClientConfig(client_config_data, model_name, client_env_var),
} }
print_info(f"disagg_config: {disagg_config}")
disagg_configs.append(disagg_config) disagg_configs.append(disagg_config)
return disagg_configs return disagg_configs
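For orientation, a rough sketch of the multi-node disaggregated YAML this parser consumes, expressed as the Python dict `yaml.safe_load` would return. Only keys that are actually read above are listed; the model, server counts, and worker settings are illustrative.

```python
# Illustrative only: structure consumed by parse_multi_node_disagg_config_file().
disagg_yaml = {
    "metadata": {"model_name": "deepseek_r1_0528_fp4_v2"},        # required
    "hardware": {"gpus_per_node": 4, "num_ctx_servers": 1, "num_gen_servers": 1},
    "benchmark": {"mode": "e2e", "concurrency_list": "1 8 64", "streaming": True},
    "environment": {"worker_env_var": "", "server_env_var": "", "client_env_var": ""},
    "slurm": {"timeout": 7200, "numa_bind": False},
    "worker_config": {"ctx": {"max_batch_size": 4}, "gen": {"max_batch_size": 768}},
}
# With concurrency_list "1 8 64", the client uses max(1, 8, 64) == 64, and a
# "gen_only" benchmark mode forces num_ctx_servers to 0.
```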
@ -1114,6 +1207,8 @@ class PerfTestConfig:
self.upload_to_db = False self.upload_to_db = False
self.config_file = None self.config_file = None
self.gpu_type = None self.gpu_type = None
self.config_dir = None
self.config_file = None
self.config_path = None self.config_path = None
self.select_pattern = None self.select_pattern = None
# Aggregated mode # Aggregated mode
@ -1330,35 +1425,47 @@ class PerfTestConfig:
# Extract configs from test param labels. # Extract configs from test param labels.
labels = test_param_labels.split("-") labels = test_param_labels.split("-")
def get_gpu_type(label: str) -> str: def get_gpu_type() -> str:
parts = label.split("_") try:
if len(parts) < 2 or parts[0] != "l0": output = subprocess.check_output(["nvidia-smi", "-L"],
return "" stderr=subprocess.DEVNULL,
if parts[1] == "dgx": text=True)
if len(parts) >= 3: first_line = output.strip().split("\n")[0]
gpu_type = f"{parts[1]}_{parts[2]}" gpu_models = ["GB300", "GB200", "B300", "B200"]
else: for model in gpu_models:
gpu_type = "" if model in first_line:
else: if model.startswith("B") and not model.startswith("GB"):
gpu_type = parts[1] return f"dgx_{model.lower()}"
return gpu_type.lower() return model.lower()
except (subprocess.CalledProcessError, FileNotFoundError,
IndexError) as e:
print_error(
f"Failed to get GPU type: {subprocess.CalledProcessError}")
return ""
# Used for perf sanity test
if "perf_sanity" in labels[0]: if "perf_sanity" in labels[0]:
assert len(labels) > 1, "perf_sanity test must have a config file!" assert len(labels) > 1, "perf_sanity test must have a config file!"
is_disagg = "disagg" in labels[0]
self.upload_to_db = "upload" in labels[0] self.upload_to_db = "upload" in labels[0]
self.config_file = labels[1] self.gpu_type = get_gpu_type()
if "disagg" in labels[1]: if is_disagg:
# For disagg, test name is like: perf_sanity_disagg-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX
# labels[0] is perf_sanity_disagg, "-".join(labels[1:]) is config file base name
self.runtime = "multi_node_disagg_server" self.runtime = "multi_node_disagg_server"
self.config_dir = "tests/integration/defs/perf/disagg/test_configs/disagg/perf"
config_base = "-".join(labels[1:])
self.config_file = f"{config_base}.yaml" if not config_base.endswith(
".yaml") else config_base
self.select_pattern = None
else: else:
# For aggr, test name is like: perf_sanity_aggr-l0_dgx_b300-r1_fp8_dep8_mtp1_1k1k
# labels[0] is perf_sanity_aggr, labels[1] is config file base name, labels[2] is select_pattern (optional)
self.runtime = "aggr_server" self.runtime = "aggr_server"
self.gpu_type = get_gpu_type(labels[1]) self.config_dir = "tests/scripts/perf-sanity"
config_folder = os.getenv("TRTLLM_CONFIG_FOLDER", config_base = labels[1]
"tests/scripts/perf-sanity") self.config_file = f"{config_base}.yaml" if config_base and not config_base.endswith(
self.config_path = os.path.join( ".yaml") else config_base
config_folder, f"{labels[1]}.yaml" self.select_pattern = labels[2] if len(labels) > 2 else None
if not labels[1].endswith(".yaml") else labels[1])
self.select_pattern = labels[2] if len(labels) > 2 else None
return return
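Following the comments above, the label parsing works out like this for the two perf-sanity flavors; the test names are taken from the comments and test lists in this change.

```python
# Disaggregated: everything after the first "-" is the config file base name.
disagg = "perf_sanity_disagg-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX"
labels = disagg.split("-")
config_file = "-".join(labels[1:]) + ".yaml"
# -> "deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX.yaml"

# Aggregated: labels[1] is the config base name, labels[2] the optional select pattern.
aggr = "perf_sanity_aggr-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k"
a_labels = aggr.split("-")
# config file    -> "deepseek_r1_fp8_blackwell.yaml"
# select_pattern -> "r1_fp8_dep8_mtp1_1k1k"
```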
self.model_name = labels.pop(0) self.model_name = labels.pop(0)
@ -1578,21 +1685,19 @@ class PerfTestConfig:
[b >= 32 for b in self.batch_sizes] [b >= 32 for b in self.batch_sizes]
), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32." ), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32."
def set_aggr_server_configs(self, llm_root: str) -> None: def set_aggr_server_configs(self) -> None:
""" """
Set the server and client configs. Set the server and client configs.
""" """
config_file_path = os.path.join(llm_root, self.config_path)
_, self.server_configs, self.server_client_configs = parse_aggr_config_file( _, self.server_configs, self.server_client_configs = parse_aggr_config_file(
config_file_path, self.select_pattern) self.config_path, self.select_pattern)
def set_multi_node_disagg_server_configs(self, llm_root: str) -> None: def set_multi_node_disagg_server_configs(self) -> None:
""" """
Set the multi-node disaggregated server configs. Set the multi-node disaggregated server configs.
""" """
config_file_path = os.path.join(llm_root, self.config_path)
self.disagg_configs = parse_multi_node_disagg_config_file( self.disagg_configs = parse_multi_node_disagg_config_file(
config_file_path, self.select_pattern) self.config_path, self.select_pattern)
def get_model_family(self) -> str: def get_model_family(self) -> str:
""" """
@ -1682,6 +1787,13 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
output_dir, output_dir,
perf_cache_fpath, perf_cache_fpath,
gpu_clock_lock=None) -> None: gpu_clock_lock=None) -> None:
if self._config.runtime == "aggr_server" or self._config.runtime == "multi_node_disagg_server":
self._config.config_dir = os.getenv(
"TRTLLM_CONFIG_FOLDER",
os.path.join(llm_root, self._config.config_dir))
self._config.config_path = os.path.join(self._config.config_dir,
self._config.config_file)
if self._config.runtime == "cpp": if self._config.runtime == "cpp":
if not self._config.is_bert_like(): if not self._config.is_bert_like():
raise ValueError( raise ValueError(
@ -1695,12 +1807,12 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
benchmark_script = "trtllm-bench" benchmark_script = "trtllm-bench"
elif self._config.runtime == "aggr_server": elif self._config.runtime == "aggr_server":
benchmark_script = None benchmark_script = None
self._config.set_aggr_server_configs(llm_root) self._config.set_aggr_server_configs()
elif self._config.runtime == "disagg_server": elif self._config.runtime == "disagg_server":
benchmark_script = None benchmark_script = None
elif self._config.runtime == "multi_node_disagg_server": elif self._config.runtime == "multi_node_disagg_server":
benchmark_script = None benchmark_script = None
self._config.set_multi_node_disagg_server_configs(llm_root) self._config.set_multi_node_disagg_server_configs()
else: else:
raise RuntimeError(f"Invalid runtime {self._config.runtime}.") raise RuntimeError(f"Invalid runtime {self._config.runtime}.")
@ -1730,15 +1842,12 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
def get_trtllm_aggr_commands(self, output_dir): def get_trtllm_aggr_commands(self, output_dir):
server_cmds = [] server_cmds = []
server_envs = []
client_cmds = [] client_cmds = []
client_envs = []
names = [] names = []
for server_idx, client_configs in self._config.server_client_configs.items( for server_idx, client_configs in self._config.server_client_configs.items(
): ):
server_config = self._config.server_configs[server_idx] server_config = self._config.server_configs[server_idx]
server_cmd = server_config.to_cmd(output_dir) server_cmd = server_config.to_cmd(output_dir)
server_env = server_config.to_env()
# Generate extra-llm-api-config.yml # Generate extra-llm-api-config.yml
config_content = server_config.generate_extra_llm_api_config() config_content = server_config.generate_extra_llm_api_config()
config_filename = f"extra-llm-api-config.{server_config.name}.yml" config_filename = f"extra-llm-api-config.{server_config.name}.yml"
@ -1747,49 +1856,35 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
f.write(config_content) f.write(config_content)
for client_config in client_configs: for client_config in client_configs:
server_cmds.append(server_cmd) server_cmds.append(server_cmd)
server_envs.append(server_env)
client_cmd = client_config.to_cmd() client_cmd = client_config.to_cmd()
client_env = client_config.to_env()
client_cmds.append(client_cmd) client_cmds.append(client_cmd)
client_envs.append(client_env)
names.append(f"{server_config.name}-{client_config.name}") names.append(f"{server_config.name}-{client_config.name}")
return server_cmds, server_envs, client_cmds, client_envs, names return server_cmds, client_cmds, names
def get_trtllm_multi_node_disagg_commands(self, output_dir): def get_trtllm_multi_node_disagg_commands(self, output_dir):
ctx_server_cmds = [] ctx_server_cmds = []
ctx_server_envs = []
gen_server_cmds = [] gen_server_cmds = []
gen_server_envs = []
disagg_server_cmds = [] disagg_server_cmds = []
disagg_server_envs = []
benchmark_cmds = [] benchmark_cmds = []
benchmark_envs = []
cmd_idx = 0 cmd_idx = 0
for disagg_config in self._config.disagg_configs: for disagg_config in self._config.disagg_configs:
disagg_serving_type = disagg_config['disagg_serving_type'] disagg_serving_type = disagg_config['disagg_serving_type']
disagg_config['hostname'] disagg_config['hostname']
numa_bind = disagg_config['numa_bind'] numa_bind = disagg_config['numa_bind']
ctx_server_cmd = None ctx_server_cmd = None
ctx_server_env = None
gen_server_cmd = None gen_server_cmd = None
gen_server_env = None
disagg_server_cmd = None disagg_server_cmd = None
disagg_server_env = None
benchmark_cmd = None benchmark_cmd = None
benchmark_env = None
if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type: if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type:
is_ctx = "CTX" in disagg_serving_type is_ctx = "CTX" in disagg_serving_type
server_config = disagg_config[ server_config = disagg_config[
'ctx_server'] if is_ctx else disagg_config['gen_server'] 'ctx_server'] if is_ctx else disagg_config['gen_server']
server_cmd = server_config.to_cmd(output_dir, numa_bind, server_cmd = server_config.to_cmd(output_dir, numa_bind,
disagg_serving_type) disagg_serving_type)
server_env = server_config.to_env()
if is_ctx: if is_ctx:
ctx_server_cmd = server_cmd ctx_server_cmd = server_cmd
ctx_server_env = server_env
else: else:
gen_server_cmd = server_cmd gen_server_cmd = server_cmd
gen_server_env = server_env
# Generate extra-llm-api-config.yml # Generate extra-llm-api-config.yml
config_content = server_config.generate_extra_llm_api_config() config_content = server_config.generate_extra_llm_api_config()
config_filename = f"extra-llm-api-config.{server_config.name}.yml" config_filename = f"extra-llm-api-config.{server_config.name}.yml"
@ -1805,21 +1900,15 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
str(timeout), "-r", str(timeout), "-r",
str(timeout) str(timeout)
] ]
disagg_server_env = to_env_dict(disagg_config['server_env_var'])
elif "BENCHMARK" in disagg_serving_type: elif "BENCHMARK" in disagg_serving_type:
# Generate benchmark command if this is the BENCHMARK server node # Generate benchmark command if this is the BENCHMARK server node
benchmark_cmd = disagg_config['client'].to_cmd() benchmark_cmd = disagg_config['client'].to_cmd()
benchmark_env = disagg_config['client'].to_env()
ctx_server_cmds.append(ctx_server_cmd) ctx_server_cmds.append(ctx_server_cmd)
ctx_server_envs.append(ctx_server_env)
gen_server_cmds.append(gen_server_cmd) gen_server_cmds.append(gen_server_cmd)
gen_server_envs.append(gen_server_env)
disagg_server_cmds.append(disagg_server_cmd) disagg_server_cmds.append(disagg_server_cmd)
disagg_server_envs.append(disagg_server_env)
benchmark_cmds.append(benchmark_cmd) benchmark_cmds.append(benchmark_cmd)
benchmark_envs.append(benchmark_env)
cmd_idx += 1 cmd_idx += 1
return ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs return ctx_server_cmds, gen_server_cmds, disagg_server_cmds, benchmark_cmds
def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list: def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list:
build_cmd = [ build_cmd = [
@ -2094,12 +2183,10 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
if is_aggr: if is_aggr:
if not os.path.exists(perf_sanity_output_dir): if not os.path.exists(perf_sanity_output_dir):
os.makedirs(perf_sanity_output_dir, exist_ok=True) os.makedirs(perf_sanity_output_dir, exist_ok=True)
server_cmds, server_envs, client_cmds, client_envs, names = self.get_trtllm_aggr_commands( server_cmds, client_cmds, names = self.get_trtllm_aggr_commands(
perf_sanity_output_dir) perf_sanity_output_dir)
return PerfAggrScriptTestCmds(server_cmds=server_cmds, return PerfAggrScriptTestCmds(server_cmds=server_cmds,
server_envs=server_envs,
client_cmds=client_cmds, client_cmds=client_cmds,
client_envs=client_envs,
names=names, names=names,
timeout=3600, timeout=3600,
output_dir=perf_sanity_output_dir) output_dir=perf_sanity_output_dir)
@ -2115,17 +2202,13 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
if is_multi_node_disagg: if is_multi_node_disagg:
if not os.path.exists(perf_sanity_output_dir): if not os.path.exists(perf_sanity_output_dir):
os.makedirs(perf_sanity_output_dir, exist_ok=True) os.makedirs(perf_sanity_output_dir, exist_ok=True)
ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs = self.get_trtllm_multi_node_disagg_commands( ctx_server_cmds, gen_server_cmds, disagg_server_cmds, benchmark_cmds = self.get_trtllm_multi_node_disagg_commands(
perf_sanity_output_dir) perf_sanity_output_dir)
return PerfMultiNodeDisaggScriptTestCmds( return PerfMultiNodeDisaggScriptTestCmds(
ctx_server_cmds=ctx_server_cmds, ctx_server_cmds=ctx_server_cmds,
ctx_server_envs=ctx_server_envs,
gen_server_cmds=gen_server_cmds, gen_server_cmds=gen_server_cmds,
gen_server_envs=gen_server_envs,
disagg_server_cmds=disagg_server_cmds, disagg_server_cmds=disagg_server_cmds,
disagg_server_envs=disagg_server_envs,
benchmark_cmds=benchmark_cmds, benchmark_cmds=benchmark_cmds,
benchmark_envs=benchmark_envs,
timeout=self._config.disagg_configs[0]['timeout'], timeout=self._config.disagg_configs[0]['timeout'],
hostname=self._config.disagg_configs[0]['hostname'], hostname=self._config.disagg_configs[0]['hostname'],
disagg_serving_type=self._config.disagg_configs[0] disagg_serving_type=self._config.disagg_configs[0]
@ -2156,6 +2239,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
build_cmd = self.get_trtllm_bench_build_command(engine_dir) build_cmd = self.get_trtllm_bench_build_command(engine_dir)
else: else:
pytest.skip("only support trtllm-bench runtime for now") pytest.skip("only support trtllm-bench runtime for now")
# Construct prepare synthetic data command # Construct prepare synthetic data command
data_cmds = [] data_cmds = []
@ -2293,32 +2377,24 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
#print info to separate cases #print info to separate cases
self._current_cmd_idx = 0 self._current_cmd_idx = 0
metrics = self._get_metrics() metrics = self._get_metrics()
commands = self.get_commands()
outputs = {} outputs = {}
result_states = {} result_states = {}
errors = [] errors = []
def add_myelin_time_pass_to(input_env): # Only trtllm-bench needs to prepare dataset first.
time_pass_flag = r" -time_pass=on"
old_myelin_env = input_env.get("__LUNOWUD", "")
if time_pass_flag not in old_myelin_env:
input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag
return old_myelin_env
old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env)
if self._config.runtime == 'bench': if self._config.runtime == 'bench':
#prepare dataset first for trtllm-bench
print_info(f"Running command for generating dataset") print_info(f"Running command for generating dataset")
outputs = self.run_ex("prepare_dataset", outputs = self.run_ex(commands=commands,
None, cmd_idx=self._current_cmd_idx,
llm_venv, full_test_name="prepare_dataset",
gpu_clock_lock, metric_type=None,
session_data_writer, venv=llm_venv,
output_dir, gpu_clock_lock=gpu_clock_lock,
session_data_writer=session_data_writer,
output_dir=output_dir,
outputs=outputs, outputs=outputs,
original_test_name="prepare_dataset", original_test_name="prepare_dataset")
cmd_idx=self._current_cmd_idx)
# Save the result state.
result_state = self.get_result_state() result_state = self.get_result_state()
result_states[self._current_cmd_idx] = result_state result_states[self._current_cmd_idx] = result_state
if result_state != "valid": if result_state != "valid":
@ -2349,15 +2425,16 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
# Run the command or reuse the existing output logs. # Run the command or reuse the existing output logs.
print_info(f"Running command for {metric.metric_name}") print_info(f"Running command for {metric.metric_name}")
outputs = self.run_ex( outputs = self.run_ex(
metric.metric_name, commands=commands,
metric.metric_type, cmd_idx=self._current_cmd_idx,
llm_venv, full_test_name=metric.metric_name,
gpu_clock_lock, metric_type=metric.metric_type,
session_data_writer, venv=llm_venv,
output_dir, gpu_clock_lock=gpu_clock_lock,
session_data_writer=session_data_writer,
output_dir=output_dir,
outputs=outputs, outputs=outputs,
original_test_name=metric.original_test_name, original_test_name=metric.original_test_name)
cmd_idx=self._current_cmd_idx)
# Save the result state. # Save the result state.
result_state = self.get_result_state() result_state = self.get_result_state()
@ -2373,6 +2450,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
# Clean up engine dir after use. # Clean up engine dir after use.
shutil.rmtree(self._get_engine_dir(), ignore_errors=True) shutil.rmtree(self._get_engine_dir(), ignore_errors=True)
def add_myelin_time_pass_to(input_env):
time_pass_flag = r" -time_pass=on"
old_myelin_env = input_env.get("__LUNOWUD", "")
if time_pass_flag not in old_myelin_env:
input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag
return old_myelin_env
old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env)
llm_venv._new_env["__LUNOWUD"] = old_llm_venv llm_venv._new_env["__LUNOWUD"] = old_llm_venv
# Check if any commands failed. # Check if any commands failed.
@ -2393,14 +2478,19 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
Upload the test results and baseline to database. Upload the test results and baseline to database.
""" """
def prefix_server_config_dict(config_dict: dict, def add_prefix(key: str, prefix_name: str) -> dict:
prefix_name: str) -> dict: type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_'
prefixed_dict = {} rest = key[2:]
for key, value in config_dict.items(): return f"{type_prefix}{prefix_name}_{rest}"
type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_'
rest = key[2:] def add_list_prefix(config_list: List, prefix_name: str) -> List:
prefixed_dict[f"{type_prefix}{prefix_name}_{rest}"] = value return [add_prefix(key, prefix_name) for key in config_list]
return prefixed_dict
def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict:
return {
add_prefix(key, prefix_name): value
for key, value in config_dict.items()
}
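A small sketch of what the prefix helpers above produce: the two-character type tag stays in front, so ctx- and gen-server columns remain distinguishable within one database row.

```python
def add_prefix(key: str, prefix_name: str) -> str:
    # Same logic as above: keep the 2-char type tag, insert the server prefix.
    return f"{key[:2]}{prefix_name}_{key[2:]}"

assert add_prefix("l_max_batch_size", "ctx") == "l_ctx_max_batch_size"
assert add_prefix("s_moe_backend", "gen") == "s_gen_moe_backend"
```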
match_keys = [] match_keys = []
# Only aggr_server and multi_node_disagg_server will upload. # Only aggr_server and multi_node_disagg_server will upload.
@ -2441,12 +2531,12 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
new_data_dict[cmd_idx] = new_data new_data_dict[cmd_idx] = new_data
cmd_idx += 1 cmd_idx += 1
if not match_keys: if not match_keys:
match_keys.append("s_runtime")
if server_config.match_mode == "scenario": if server_config.match_mode == "scenario":
match_keys = SCENARIO_MATCH_FIELDS.copy() match_keys = SCENARIO_MATCH_FIELDS.copy()
else: else:
match_keys.append("s_runtime") match_keys.extend(server_config.to_match_keys())
match_keys.extend(server_config_dict.keys()) match_keys.extend(client_config.to_match_keys())
match_keys.extend(client_config_dict.keys())
elif self._config.runtime == "multi_node_disagg_server": elif self._config.runtime == "multi_node_disagg_server":
if self._config.disagg_configs[0][ if self._config.disagg_configs[0][
@ -2472,27 +2562,28 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
) )
gen_server_config_dict = disagg_config['gen_server'].to_db_data( gen_server_config_dict = disagg_config['gen_server'].to_db_data(
) )
ctx_server_config_dict = prefix_server_config_dict(
ctx_server_config_dict, 'ctx')
gen_server_config_dict = prefix_server_config_dict(
gen_server_config_dict, 'gen')
client_config_dict = disagg_config['client'].to_db_data() client_config_dict = disagg_config['client'].to_db_data()
# Build new_data ctx_server_config_dict = add_dict_prefix(
ctx_server_config_dict, 'ctx')
gen_server_config_dict = add_dict_prefix(
gen_server_config_dict, 'gen')
hardware = disagg_config.get('hardware', {})
num_ctx_servers = hardware.get('num_ctx_servers', 0)
num_gen_servers = hardware.get('num_gen_servers', 0)
new_data = { new_data = {
"s_runtime": "multi_node_disagg_server", "s_runtime": "multi_node_disagg_server",
"s_server_env_var": disagg_config['server_env_var'] "s_benchmark_mode": disagg_config['mode'],
"s_server_env_var": disagg_config['server_env_var'],
"l_num_ctx_servers": num_ctx_servers,
"l_num_gen_servers": num_gen_servers
} }
new_data.update(job_config) new_data.update(job_config)
new_data.update(ctx_server_config_dict) if num_ctx_servers > 0:
new_data.update(gen_server_config_dict) new_data.update(ctx_server_config_dict)
if num_gen_servers > 0:
new_data.update(gen_server_config_dict)
new_data.update(client_config_dict) new_data.update(client_config_dict)
# Add hardware information
hardware = disagg_config.get('hardware', {})
new_data["l_num_ctx_servers"] = hardware.get(
'num_ctx_servers', 0)
new_data["l_num_gen_servers"] = hardware.get(
'num_gen_servers', 0)
# Add metrics from test results
for metric_type in AGGR_SERVER_METRICS: for metric_type in AGGR_SERVER_METRICS:
new_data[ new_data[
f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[ f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[
@ -2503,9 +2594,17 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
if not match_keys: if not match_keys:
match_keys.extend( match_keys.extend(
["s_runtime", "l_num_ctx_servers", "l_num_gen_servers"]) ["s_runtime", "l_num_ctx_servers", "l_num_gen_servers"])
match_keys.extend(ctx_server_config_dict.keys()) if num_ctx_servers > 0:
match_keys.extend(gen_server_config_dict.keys()) match_keys.extend(
match_keys.extend(client_config_dict.keys()) add_list_prefix(
disagg_config['ctx_server'].to_match_keys(),
'ctx'))
if num_gen_servers > 0:
match_keys.extend(
add_list_prefix(
disagg_config['gen_server'].to_match_keys(),
'gen'))
match_keys.extend(disagg_config['client'].to_match_keys())
else: else:
return return
@ -2519,7 +2618,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
if is_post_merge: if is_post_merge:
# Prepare new baseline data for post-merge # Prepare new baseline data for post-merge
new_baseline_data_dict = prepare_baseline_data( new_baseline_data_dict = prepare_baseline_data(
history_baseline_dict, history_data_dict, new_data_dict) history_data_dict, new_data_dict)
else: else:
# Pre-merge does not need to upload baseline data # Pre-merge does not need to upload baseline data
new_baseline_data_dict = None new_baseline_data_dict = None

View File

@ -245,9 +245,7 @@ class PerfBenchScriptTestCmds(NamedTuple):
class PerfAggrScriptTestCmds(NamedTuple): class PerfAggrScriptTestCmds(NamedTuple):
server_cmds: List[List[str]] server_cmds: List[List[str]]
server_envs: List[Dict[str, str]]
client_cmds: List[List[str]] client_cmds: List[List[str]]
client_envs: List[Dict[str, str]]
names: List[str] names: List[str]
timeout: int timeout: int
output_dir: str output_dir: str
@ -345,13 +343,9 @@ class PerfDisaggScriptTestCmds(NamedTuple):
class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): class PerfMultiNodeDisaggScriptTestCmds(NamedTuple):
ctx_server_cmds: List[List[str]] ctx_server_cmds: List[List[str]]
ctx_server_envs: List[Dict[str, str]]
gen_server_cmds: List[List[str]] gen_server_cmds: List[List[str]]
gen_server_envs: List[Dict[str, str]]
disagg_server_cmds: List[List[str]] disagg_server_cmds: List[List[str]]
disagg_server_envs: List[Dict[str, str]]
benchmark_cmds: List[List[str]] benchmark_cmds: List[List[str]]
benchmark_envs: List[Dict[str, str]]
timeout: int timeout: int
hostname: str hostname: str
disagg_serving_type: str disagg_serving_type: str
@ -694,23 +688,21 @@ class AbstractPerfScriptTestClass(abc.ABC):
) )
def run_ex(self, def run_ex(self,
commands,
full_test_name: str, full_test_name: str,
metric_type: PerfMetricType, metric_type: PerfMetricType,
venv: Optional[PythonVenvRunnerImpl], venv: Optional[PythonVenvRunnerImpl],
gpu_clock_lock: GPUClockLock, gpu_clock_lock: GPUClockLock,
session_data_writer: SessionDataWriter, session_data_writer: SessionDataWriter,
output_dir: str, output_dir: str,
cmd_idx: int = 0,
outputs: Dict[int, str] = {}, outputs: Dict[int, str] = {},
original_test_name: str = None, original_test_name: str = None,
cmd_idx: int = 0,
**kwargs) -> List[str]: **kwargs) -> List[str]:
""" """
Run the commands and write the results to the output csv and/or yaml files. Run the commands and write the results to the output csv and/or yaml files.
""" """
# Get the commands.
commands = self.get_commands()
# Avoid modifying argument directly # Avoid modifying argument directly
outputs = outputs.copy() outputs = outputs.copy()
@ -723,7 +715,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
cmd_str = commands.get_cmd_str(cmd_idx) cmd_str = commands.get_cmd_str(cmd_idx)
is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str
is_perf_sanity_test = "perf_sanity" in full_test_name is_perf_sanity_test = "perf_sanity" in full_test_name
is_disagg_server = False is_disagg_server = False
@ -804,7 +795,8 @@ class AbstractPerfScriptTestClass(abc.ABC):
outputs.pop(cmd_idx) outputs.pop(cmd_idx)
elif is_disagg_server: elif is_disagg_server:
print_info( print_info(
f"skip writing perf result when running disagg's server.") f"skip writing perf result when running disagg's worker or server."
)
else: else:
self._perf_result = self.get_perf_result(outputs) self._perf_result = self.get_perf_result(outputs)

View File

@ -15,9 +15,9 @@ l0_dgx_b200_perf_sanity:
backend: pytorch backend: pytorch
orchestrator: mpi orchestrator: mpi
tests: tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- condition: - condition:
ranges: ranges:
@ -34,8 +34,8 @@ l0_dgx_b200_perf_sanity:
backend: pytorch backend: pytorch
orchestrator: mpi orchestrator: mpi
tests: tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180)

View File

@ -16,9 +16,9 @@ l0_dgx_b300_perf_sanity:
backend: pytorch backend: pytorch
orchestrator: mpi orchestrator: mpi
tests: tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- condition: - condition:
ranges: ranges:
@ -36,6 +36,6 @@ l0_dgx_b300_perf_sanity:
backend: pytorch backend: pytorch
orchestrator: mpi orchestrator: mpi
tests: tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)

View File

@ -14,6 +14,6 @@ l0_gb200_multi_gpus_perf_sanity:
stage: post_merge stage: post_merge
backend: pytorch backend: pytorch
tests: tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_1k1k] - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k] - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k]
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_1k1k] - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k]

View File

@ -1,5 +1,5 @@
version: 0.0.1 version: 0.0.1
l0_gb200_multi_nodes_perf_sanity: l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001:
- condition: - condition:
ranges: ranges:
# 2 nodes with each node has 4 GPUs # 2 nodes with each node has 4 GPUs
@ -13,4 +13,4 @@ l0_gb200_multi_nodes_perf_sanity:
stage: post_merge stage: post_merge
backend: pytorch backend: pytorch
tests: tests:
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_nodes-r1_fp4_v2_dep8_mtp1] - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1]

View File

@ -0,0 +1,16 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001:
- condition:
ranges:
# 3 nodes with each node has 4 GPUs
system_gpu_count:
gte: 12
lte: 12
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (180)

View File

@ -0,0 +1,16 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001:
- condition:
ranges:
# 6 nodes with each node has 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)

View File

@ -0,0 +1,16 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002:
- condition:
ranges:
# 6 nodes with each node has 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (180)

View File

@ -0,0 +1,16 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001:
- condition:
ranges:
# 8 nodes with each node has 4 GPUs
system_gpu_count:
gte: 32
lte: 32
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)

View File

@ -4,106 +4,31 @@ Performance sanity testing scripts for TensorRT-LLM with configuration-driven te
## Overview ## Overview
- Run performance sanity benchmarks across multiple model configurations - Run performance sanity benchmarks across multiple model configs
- Support three deployment architectures: single-node, multi-node aggregated, and multi-node disaggregated - Support three deployment architectures: single-node, multi-node aggregated, and multi-node disaggregated
- Manage test cases through YAML configuration files - Manage test cases through YAML config files
- Automated resource calculation and job submission via SLURM - Automated resource calculation and job submission via SLURM
## Configuration File Types ## Configuration File Types
There are three types of YAML configuration files for different deployment architectures: There are three types of YAML config files for different deployment architectures.
Aggregated config files are in [`tests/scripts/perf-sanity`](./).
Disaggregated config files are in [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf).
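As an orientation aid, the test harness resolves a test name to a config file inside the matching directory; `TRTLLM_CONFIG_FOLDER` can override the default location. A minimal sketch:

```python
import os

# Illustrative resolution for an aggregated perf-sanity test name such as
# perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k
config_dir = os.getenv("TRTLLM_CONFIG_FOLDER", "tests/scripts/perf-sanity")
config_path = os.path.join(config_dir, "deepseek_r1_fp8_blackwell.yaml")
```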
### 1. Single-Node Aggregated Test Configuration ### 1. Single-Node Aggregated Test Configuration
**File Example**: `l0_dgx_b200.yaml` **File Example**: `deepseek_r1_fp4_v2_grace_blackwell.yaml`
**Use Case**: Single-node performance tests on a single server with multiple GPUs. **Use Case**: Single-node performance tests on a single server with multiple GPUs.
**Structure**:
```yaml
server_configs:
- name: "r1_fp8_dep8_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp8"
gpus: 8
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 8192
attention_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'DEEPGEMM'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con4096_iter10_1k1k"
concurrency: 4096
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
```
### 2. Multi-Node Aggregated Test Configuration ### 2. Multi-Node Aggregated Test Configuration
**File Example**: `l0_gb200_multi_nodes.yaml` **File Example**: `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml`
**Use Case**: Multi-node aggregated architecture where the model runs across multiple nodes with unified execution. **Use Case**: Multi-node aggregated architecture where the model runs across multiple nodes with unified execution.
**Structure**: ### 3. Multi-Node Disaggregated Test Configuration
```yaml
# Hardware Config
hardware:
gpus_per_node: 4
gpus_per_server: 8
server_configs: **File Example**: `deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml`
- name: "r1_fp4_v2_dep8_mtp1"
model_name: "deepseek_r1_0528_fp4_v2" **Use Case**: Disaggregated architecture where model runs across multiple nodes with separate context (prefill) and generation (decode) servers.
gpus: 8
gpus_per_node: 4
trust_remote_code: true
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 2112
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.5
client_configs:
- name: "con32_iter12_1k1k"
concurrency: 32
iterations: 12
isl: 1024
osl: 1024
random_range_ratio: 0.8
backend: "openai"
```

View File

@ -1,13 +1,13 @@
# Hardware Config metadata:
model_name: deepseek_r1_0528_fp4_v2
supported_gpus:
- GB200
- GB300
hardware: hardware:
gpus_per_node: 4 gpus_per_node: 4
gpus_per_server: 8
server_configs: server_configs:
- name: "r1_fp4_v2_dep8_mtp1" - name: "r1_fp4_v2_dep8_mtp1"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 8
gpus_per_node: 4
trust_remote_code: true trust_remote_code: true
tensor_parallel_size: 8 tensor_parallel_size: 8
moe_expert_parallel_size: 8 moe_expert_parallel_size: 8
@ -37,11 +37,8 @@ server_configs:
osl: 1024 osl: 1024
random_range_ratio: 0.2 random_range_ratio: 0.2
backend: "openai" backend: "openai"
- name: "r1_fp4_v2_tep8_mtp3" - name: "r1_fp4_v2_tep8_mtp3"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 8
gpus_per_node: 4
trust_remote_code: true trust_remote_code: true
tensor_parallel_size: 8 tensor_parallel_size: 8
moe_expert_parallel_size: 8 moe_expert_parallel_size: 8

View File

@ -0,0 +1,99 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
supported_gpus:
- B200
- B300
server_configs:
- name: "r1_fp4_v2_dep4_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k1k"
concurrency: 2048
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 32
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con32_iter10_1k1k"
concurrency: 32
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
tensor_parallel_size: 4
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 4
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con4_iter10_1k1k"
concurrency: 4
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"

View File

@ -1,8 +1,12 @@
metadata:
model_name: deepseek_r1_0528_fp4_v2
supported_gpus:
- GB200
- GB300
server_configs: server_configs:
# 1k1k configs # 1k1k configs
- name: "r1_fp4_v2_dep4_mtp1_1k1k" - name: "r1_fp4_v2_dep4_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 4 moe_expert_parallel_size: 4
pipeline_parallel_size: 1 pipeline_parallel_size: 1
@ -37,7 +41,6 @@ server_configs:
- name: "r1_fp4_v2_tep4_mtp3_1k1k" - name: "r1_fp4_v2_tep4_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 4 moe_expert_parallel_size: 4
pipeline_parallel_size: 1 pipeline_parallel_size: 1
@ -68,7 +71,6 @@ server_configs:
- name: "r1_fp4_v2_tp4_mtp3_1k1k" - name: "r1_fp4_v2_tp4_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 1 moe_expert_parallel_size: 1
pipeline_parallel_size: 1 pipeline_parallel_size: 1
@ -100,7 +102,6 @@ server_configs:
# 8k1k configs # 8k1k configs
- name: "r1_fp4_v2_dep4_mtp1_8k1k" - name: "r1_fp4_v2_dep4_mtp1_8k1k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 4 moe_expert_parallel_size: 4
pipeline_parallel_size: 1 pipeline_parallel_size: 1
@ -135,7 +136,6 @@ server_configs:
- name: "r1_fp4_v2_tep4_mtp3_8k1k" - name: "r1_fp4_v2_tep4_mtp3_8k1k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 4 moe_expert_parallel_size: 4
pipeline_parallel_size: 1 pipeline_parallel_size: 1
@ -166,7 +166,6 @@ server_configs:
- name: "r1_fp4_v2_tp4_mtp3_8k1k" - name: "r1_fp4_v2_tp4_mtp3_8k1k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 1 moe_expert_parallel_size: 1
pipeline_parallel_size: 1 pipeline_parallel_size: 1
@ -198,7 +197,6 @@ server_configs:
# 1k8k configs # 1k8k configs
- name: "r1_fp4_v2_dep4_mtp1_1k8k" - name: "r1_fp4_v2_dep4_mtp1_1k8k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 4 moe_expert_parallel_size: 4
pipeline_parallel_size: 1 pipeline_parallel_size: 1
@ -233,7 +231,6 @@ server_configs:
- name: "r1_fp4_v2_tep4_mtp3_1k8k" - name: "r1_fp4_v2_tep4_mtp3_1k8k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 4 moe_expert_parallel_size: 4
pipeline_parallel_size: 1 pipeline_parallel_size: 1
@ -264,7 +261,6 @@ server_configs:
- name: "r1_fp4_v2_tp4_mtp3_1k8k" - name: "r1_fp4_v2_tp4_mtp3_1k8k"
model_name: "deepseek_r1_0528_fp4_v2" model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4 tensor_parallel_size: 4
moe_expert_parallel_size: 1 moe_expert_parallel_size: 1
pipeline_parallel_size: 1 pipeline_parallel_size: 1

View File

@ -0,0 +1,99 @@
metadata:
model_name: deepseek_r1_0528_fp8
supported_gpus:
- B200
- B300
server_configs:
- name: "r1_fp8_dep8_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp8"
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'DEEPGEMM'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con4096_iter10_1k1k"
concurrency: 4096
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp8_tep8_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp8"
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
max_batch_size: 64
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'DEEPGEMM'
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con64_iter10_1k1k"
concurrency: 64
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp8_tp8_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp8"
tensor_parallel_size: 8
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con8_iter10_1k1k"
concurrency: 8
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"

View File

@ -0,0 +1,101 @@
metadata:
model_name: gpt_oss_120b_fp4
supported_gpus:
- B200
- B300
server_configs:
- name: "gpt_oss_fp4_dep2_1k1k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 2
moe_expert_parallel_size: 2
pipeline_parallel_size: 1
max_batch_size: 1024
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
enable_balance: true
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 1024
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "gpt_oss_fp4_dep4_1k1k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
enable_balance: true
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "gpt_oss_fp4_tp4_eagle3_1k1k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 4
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'Eagle'
eagle3_layers_to_capture: [-1]
max_draft_len: 3
speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3"
stream_interval: 20
num_postprocess_workers: 4
client_configs:
- name: "con1_iter32_1k1k"
concurrency: 1
iterations: 32
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
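The metadata.supported_gpus list in these new per-model files (B200 and B300 here) lets a runner skip a whole config file on unsupported hardware before launching any server. The sketch below illustrates one way such a gate could work; select_configs_for_gpu, its gpu_type argument, and the "gpt_oss_120b_fp4.yaml" filename are assumptions for illustration, not part of the repository's code.

# Hypothetical sketch: gate a perf config file on metadata.supported_gpus.
import yaml


def select_configs_for_gpu(path: str, gpu_type: str):
    """Return server_configs only if the file's metadata lists gpu_type as supported."""
    with open(path) as f:
        cfg = yaml.safe_load(f)

    supported = cfg.get("metadata", {}).get("supported_gpus", [])
    if gpu_type not in supported:
        return []  # e.g. a node type outside B200/B300 would skip this file entirely
    return cfg.get("server_configs", [])


if __name__ == "__main__":
    # Assumed filename and GPU label purely for the example.
    configs = select_configs_for_gpu("gpt_oss_120b_fp4.yaml", "B200")
    print([c["name"] for c in configs])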

View File

@@ -1,293 +0,0 @@
server_configs:
- name: "r1_fp8_dep8_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp8"
gpus: 8
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'DEEPGEMM'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con4096_iter10_1k1k"
concurrency: 4096
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp8_tep8_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp8"
gpus: 8
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
max_batch_size: 64
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'DEEPGEMM'
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con64_iter10_1k1k"
concurrency: 64
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp8_tp8_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp8"
gpus: 8
tensor_parallel_size: 8
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con8_iter10_1k1k"
concurrency: 8
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_dep4_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k1k"
concurrency: 2048
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 32
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con32_iter10_1k1k"
concurrency: 32
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 4
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con4_iter10_1k1k"
concurrency: 4
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "gpt_oss_fp4_dep2_1k1k"
model_name: "gpt_oss_120b_fp4"
gpus: 2
tensor_parallel_size: 2
moe_expert_parallel_size: 2
pipeline_parallel_size: 1
max_batch_size: 1024
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
enable_balance: true
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 1024
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "gpt_oss_fp4_dep4_1k1k"
model_name: "gpt_oss_120b_fp4"
gpus: 4
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
enable_balance: true
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "gpt_oss_fp4_tp4_eagle3_1k1k"
model_name: "gpt_oss_120b_fp4"
gpus: 4
tensor_parallel_size: 4
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 1
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 1
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'Eagle'
eagle3_layers_to_capture: [-1]
max_draft_len: 3
speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3"
stream_interval: 20
num_postprocess_workers: 4
client_configs:
- name: "con1_iter32_1k1k"
concurrency: 1
iterations: 32
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"

View File

@@ -1,194 +0,0 @@
server_configs:
- name: "r1_fp8_dep8_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp8"
gpus: 8
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'DEEPGEMM'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con4096_iter10_1k1k"
concurrency: 4096
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp8_tep8_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp8"
gpus: 8
tensor_parallel_size: 8
moe_expert_parallel_size: 8
pipeline_parallel_size: 1
max_batch_size: 64
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'DEEPGEMM'
cuda_graph_config:
enable_padding: true
max_batch_size: 64
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con64_iter10_1k1k"
concurrency: 64
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp8_tp8_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp8"
gpus: 8
tensor_parallel_size: 8
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con8_iter10_1k1k"
concurrency: 8
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_dep4_mtp1_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 512
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60
moe_config:
backend: 'CUTLASS'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k1k"
concurrency: 2048
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 32
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 32
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con32_iter10_1k1k"
concurrency: 32
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
model_name: "deepseek_r1_0528_fp4_v2"
gpus: 4
tensor_parallel_size: 4
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 4
max_num_tokens: 8192
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 4
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
speculative_config:
decoding_type: 'MTP'
num_nextn_predict_layers: 3
client_configs:
- name: "con4_iter10_1k1k"
concurrency: 4
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
backend: "openai"