Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[TRTLLM-8952][feat] Support Multi-Node Disagg Perf Test in CI (#9138)
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
This commit is contained in: commit d70aeddc7f (parent 684b37df02)
@@ -748,9 +748,9 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 }
 // End of Methods to run Slurm job with Jenkins Agent

-def getNodeArgs(int nodeCount, int gpuCount) {
+def getNodeArgs(int nodeCount, int gpuCount, boolean setSegment = false) {
     int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
-    return nodeCount == 1 ? [
+    def args = nodeCount == 1 ? [
         "--nodes=${nodeCount}",
         "--gpus=${gpuCount}"
     ] : [
@@ -759,6 +759,10 @@ def getNodeArgs(int nodeCount, int gpuCount) {
         "--ntasks-per-node=${gpusPerNode}",
         "--gpus-per-node=${gpusPerNode}",
     ]
+    if (setSegment && gpuCount > 1) {
+        args += ["--segment=${nodeCount}"]
+    }
+    return args
 }

 def getPytestBaseCommandLine(
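Note: a minimal Python sketch of the argument logic above, for reference only. The multi-node branch's leading entries fall outside the hunk and are omitted here, and my reading of `--segment` (keep the allocated nodes within contiguous topology blocks, e.g. one NVL72 rack) is an assumption, not stated in the diff.

    # Sketch (not the pipeline's code): mirrors getNodeArgs above.
    import math

    def get_node_args(node_count: int, gpu_count: int, set_segment: bool = False) -> list:
        gpus_per_node = math.ceil(gpu_count / node_count)
        if node_count == 1:
            args = [f"--nodes={node_count}", f"--gpus={gpu_count}"]
        else:
            # The multi-node list's leading entries are elided in the hunk above.
            args = [f"--ntasks-per-node={gpus_per_node}", f"--gpus-per-node={gpus_per_node}"]
        if set_segment and gpu_count > 1:
            args.append(f"--segment={node_count}")  # only disagg stages pass set_segment=True
        return args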
@@ -883,6 +887,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
+    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def setSegment = disaggMode

     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

@@ -914,6 +920,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
     def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
     def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
+    def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
+    def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
     def outputPath = "${jobWorkspace}/job-output.log"
@@ -940,6 +948,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         true
     )

+    Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        scriptInstallLocalPath,
+        scriptInstallPathNode,
+        true
+    )
+
     // Generate Test List and Upload to Frontend Node
     def makoArgs = getMakoArgsFromStageName(stageName, true)
     // TODO: currently the options will only be processed if the first
@@ -1013,7 +1030,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Generate Job Launch Script
     def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
     def mounts = getMountListForSlurmTest(cluster, true).join(",")
-    String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
+    String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment)
     if (taskArgs == null) {
         error "Invalid Slurm test stage name is set"
     }
@@ -1083,10 +1100,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     envVarsToExport.each { varName, varValue ->
         srunArgs.add("--container-env=${varName}")
     }
-    if(nodeCount > 1) {
-        srunArgs.add("--mpi=pmi2")
-    }
-
     def exemptionComment = ""
     if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) {
         exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'"""
@@ -1102,8 +1115,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         "export ${varName}=\"${escapedValue}\""
     }.join('\n')

-    def scriptContent = """#!/bin/bash
-    #SBATCH ${exemptionComment} --output=${outputPath}
+    def scriptLaunchPrefix = """#!/bin/bash
+    #SBATCH ${exemptionComment}
+    #SBATCH --output=${outputPath}
     ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
     #SBATCH ${partition.additionalArgs}
     ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
@@ -1128,10 +1142,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES"

     ${srunPrologue}

-    srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
     """.replaceAll("(?m)^\\s*", "")
-    pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+    if (disaggMode) {
+        if(nodeCount > 1) {
+            srunArgs.add("--mpi=pmix")
+        }
+
+        def scriptLaunchPrefixPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch_prefix.sh")
+        def scriptLaunchSrunArgsPathLocal = Utils.createTempLocation(pipeline, "./slurm_srun_args.txt")
+        def scriptLaunchDraftPathLocal = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh"
+        def scriptSubmitLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/submit.py"
+
+        pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
+        pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
+        Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
+        Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")
+
+        // Output is the corresponding scriptLaunchPathLocal script under the disaggMode
+        sh """
+            python3 ${scriptSubmitLocalPath} \\
+                --run-ci \\
+                --llm-src ${llmSrcLocal} \\
+                --test-list ${testListPathLocal} \\
+                --draft-launch-sh ${scriptLaunchDraftPathLocal} \\
+                --launch-sh ${scriptLaunchPathLocal} \\
+                --run-sh ${scriptRunPathNode} \\
+                --install-sh ${scriptInstallPathNode} \\
+                --script-prefix ${scriptLaunchPrefixPathLocal} \\
+                --srun-args ${scriptLaunchSrunArgsPathLocal}
+        """
+    } else {
+        if(nodeCount > 1) {
+            srunArgs.add("--mpi=pmi2")
+        }
+
+        def scriptContent = """
+        ${scriptLaunchPrefix}
+        srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
+        """.replaceAll("(?m)^\\s*", "")
+        pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+    }

     Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
     Utils.copyFileToRemoteHost(
         pipeline,
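Note: in disagg mode the sbatch script is no longer assembled inline. submit.py (added later in this commit) stitches together three inputs: the #SBATCH prefix (slurm_launch_prefix.sh), the serialized srun arguments (slurm_srun_args.txt), and the body of slurm_launch_draft.sh. A sketch of that composition, matching the writes in submit.py's main() (file names illustrative):

    # Sketch of the assembly performed by submit.py.
    prefix = open("slurm_launch_prefix.sh").read()          # "#!/bin/bash" + "#SBATCH ..." lines
    srun_args = open("slurm_srun_args.txt").read().split()  # container image/mount/env flags
    draft = open("slurm_launch_draft.sh").read()            # role-launching body

    srun_array = "\n".join(["srunArgs=("] + [f'    "{a}"' for a in srun_args] + [")"])
    launch_script = f"{prefix}\n{srun_array}\n{draft}"      # the script that gets sbatch-ed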
@@ -2634,7 +2686,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         if (noRegularTests && noIsolateTests) {
             error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result."
         }
-
     }
 }

@@ -2653,7 +2704,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         stage("Check perf result") {
             def perfCheckResult = sh(
                 script: """
                     python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
                         ${stageName}/perf_script_test_results.csv \
                         ${basePerfPath}
                 """,
@@ -2672,6 +2723,22 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                 """
             }
         }

+        if (perfMode && stageName.contains("Perf-Sanity")) {
+            stage ("Check perf result") {
+                def perfCheckResult = sh(
+                    script: """
+                        python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
+                            ${WORKSPACE}/${stageName}
+                    """,
+                    returnStatus: true
+                )
+                // TODO: Enable this when perf regression check is stable
+                // if (perfCheckResult != 0) {
+                //     error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+                // }
+            }
+        }
     }
 }

@@ -3111,8 +3178,13 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // Perf sanity post merge test
-        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2],
+        // Perf sanity post merge aggr tests
+        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
+        // Perf sanity post merge disagg tests
+        "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
+        // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()

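Note: the six-element tuples are not documented in the diff; inferred from the stage names, they read as [cluster queue, test-db list, split index, split count, GPU count, node count]. A hypothetical decoding of the new disagg entry (field names are my gloss):

    stage = {
        "queue": "gb200-oci-trtllm",
        "test_db_list": "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001",
        "split": (1, 1),     # shard 1 of 1
        "gpu_count": 12,
        "node_count": 3,     # matches GB200-12_GPUs-3_Nodes in the stage name
    }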
jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh (new file, 76 lines)
@@ -0,0 +1,76 @@
+cleanup_on_failure() {
+    echo "Error: $1"
+    scancel ${SLURM_JOB_ID}
+    exit 1
+}
+
+mkdir -p $jobWorkspace
+chmod +x $runScript
+chmod +x $installScript
+
+# Run installation on all nodes
+echo "Running installation on all nodes..."
+if ! srun "${srunArgs[@]}" $installScript &> $jobWorkspace/install.log; then
+    cleanup_on_failure "Failed to run installation. Check $jobWorkspace/install.log"
+fi
+echo "Installation completed on all nodes"
+
+# Start gen servers
+echo "Starting gen servers..."
+for i in $(seq 0 $((numGenServers - 1))); do
+    gen_world_size=$((nodesPerGenServer * gpusPerNode))
+    export DISAGG_SERVING_TYPE="GEN_$i"
+    export pytestCommand="$pytestCommandWorker"
+    srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
+        -N $nodesPerGenServer \
+        --ntasks=$gen_world_size \
+        --ntasks-per-node=$gpusPerNode \
+        $runScript &> $jobWorkspace/gen_server_$i.log &
+    echo "Started gen server $i"
+done
+
+# Start ctx servers (skip if gen_only mode)
+if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then
+    echo "Starting ctx servers..."
+    for i in $(seq 0 $((numCtxServers - 1))); do
+        ctx_world_size=$((nodesPerCtxServer * gpusPerNode))
+        export DISAGG_SERVING_TYPE="CTX_$i"
+        export pytestCommand="$pytestCommandWorker"
+        srun "${srunArgs[@]}" --kill-on-bad-exit=1 \
+            -N $nodesPerCtxServer \
+            --ntasks=$ctx_world_size \
+            --ntasks-per-node=$gpusPerNode \
+            $runScript &> $jobWorkspace/ctx_server_$i.log &
+        echo "Started ctx server $i"
+    done
+else
+    echo "Skipping ctx servers (gen_only mode)"
+fi
+
+# Start disagg server
+echo "Starting disagg server..."
+export DISAGG_SERVING_TYPE="DISAGG_SERVER"
+export pytestCommand="$pytestCommandDisaggServer"
+srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \
+    -N 1 \
+    --ntasks=1 \
+    --ntasks-per-node=1 \
+    $runScript &> $jobWorkspace/disagg_server.log &
+echo "Started disagg server"
+
+# Start benchmark
+echo "Starting benchmark..."
+export DISAGG_SERVING_TYPE="BENCHMARK"
+export pytestCommand="$pytestCommandBenchmark"
+if ! srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \
+    -N 1 \
+    --ntasks=1 \
+    --ntasks-per-node=1 \
+    $runScript; then
+    cleanup_on_failure "Benchmark failed. Check logs in ${jobWorkspace} for details"
+fi
+
+echo "Disagg server and benchmark completed successfully"
+echo "Total runtime: $SECONDS seconds"
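Note: every role above runs inside the same sbatch allocation and is told apart only by DISAGG_SERVING_TYPE and pytestCommand, which submit.py forwards into the container via --container-env. An illustrative dispatch on the role tag (the real branching lives in the pytest harness, which this diff does not show):

    import os

    role = os.environ.get("DISAGG_SERVING_TYPE", "")
    if role.startswith("GEN_") or role.startswith("CTX_"):
        ...  # worker ranks: run $pytestCommandWorker
    elif role == "DISAGG_SERVER":
        ...  # single-rank frontend: run $pytestCommandDisaggServer
    elif role == "BENCHMARK":
        ...  # single-rank client: run $pytestCommandBenchmark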
jenkins/scripts/perf/disaggregated/submit.py (new file, 292 lines)
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+import argparse
+import os
+
+import yaml
+
+
+def get_hardware_config(config, benchmark_mode):
+    hardware = config.get("hardware", {})
+    worker_config = config.get("worker_config", {})
+
+    num_ctx_servers = 0 if "gen_only" in benchmark_mode else hardware.get("num_ctx_servers")
+    num_gen_servers = hardware.get("num_gen_servers")
+    gpus_per_node = hardware.get("gpus_per_node")
+
+    # Get gpus_per_ctx_server and gpus_per_gen_server from worker_config's tensor_parallel_size
+    ctx_config = worker_config.get("ctx", {})
+    gen_config = worker_config.get("gen", {})
+    ctx_tp = ctx_config.get("tensor_parallel_size", 1)
+    ctx_pp = ctx_config.get("pipeline_parallel_size", 1)
+    ctx_cp = ctx_config.get("context_parallel_size", 1)
+    gpus_per_ctx_server = ctx_tp * ctx_pp * ctx_cp
+    gen_tp = gen_config.get("tensor_parallel_size", 1)
+    gen_pp = gen_config.get("pipeline_parallel_size", 1)
+    gen_cp = gen_config.get("context_parallel_size", 1)
+    gpus_per_gen_server = gen_tp * gen_pp * gen_cp
+
+    if None in [
+        num_ctx_servers,
+        num_gen_servers,
+        gpus_per_node,
+        gpus_per_ctx_server,
+        gpus_per_gen_server,
+    ]:
+        raise ValueError("Missing required hardware configuration")
+
+    # Calculate nodes per server
+    nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node
+    nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node
+
+    total_nodes = num_ctx_servers * nodes_per_ctx_server + num_gen_servers * nodes_per_gen_server
+    total_gpus = total_nodes * gpus_per_node
+
+    return {
+        "num_ctx_servers": num_ctx_servers,
+        "num_gen_servers": num_gen_servers,
+        "gpus_per_node": gpus_per_node,
+        "gpus_per_ctx_server": gpus_per_ctx_server,
+        "gpus_per_gen_server": gpus_per_gen_server,
+        "nodes_per_ctx_server": nodes_per_ctx_server,
+        "nodes_per_gen_server": nodes_per_gen_server,
+        "total_nodes": total_nodes,
+        "total_gpus": total_gpus,
+    }
+
+
+def get_env_config(config):
+    env = config.get("environment", {})
+
+    container = env.get("container_image", "")
+    mounts = env.get("container_mount", "")
+    workdir = env.get("container_workdir", "")
+    llm_models_root = env.get("llm_models_root", "")
+    llmsrc = env.get("trtllm_repo", "")
+    build_wheel = env.get("build_wheel", False)
+    # Use work_dir as job_workspace
+    job_workspace = env.get("work_dir", "")
+    worker_env_var = env.get("worker_env_var", "")
+    server_env_var = env.get("server_env_var", "")
+    benchmark_env_var = env.get("benchmark_env_var", "")
+    open_search_db_base_url = env.get("open_search_db_base_url", "")
+
+    return {
+        "container": container,
+        "mounts": mounts,
+        "workdir": workdir,
+        "llm_models_root": llm_models_root,
+        "llmsrc": llmsrc,
+        "build_wheel": build_wheel,
+        "job_workspace": job_workspace,
+        "worker_env_var": worker_env_var,
+        "server_env_var": server_env_var,
+        "benchmark_env_var": benchmark_env_var,
+        "open_search_db_base_url": open_search_db_base_url,
+    }
+
+
+def get_benchmark_config(config):
+    benchmark = config.get("benchmark", {})
+
+    mode = benchmark.get("mode", "e2e")
+    concurrency_str = benchmark.get("concurrency_list", "1")
+    concurrency = int(concurrency_str) if isinstance(concurrency_str, str) else concurrency_str
+
+    return {
+        "mode": mode,
+        "concurrency": concurrency,
+    }
+
+
+def remove_whitespace_lines(lines):
+    return [line.strip() for line in lines if line.strip()]
+
+
+def get_pytest_command_no_llmapilaunch(script_prefix_lines):
+    pytest_command_line = None
+    for line in script_prefix_lines:
+        if "export pytestCommand=" in line:
+            pytest_command_line = line
+            break
+
+    if not pytest_command_line:
+        return ""
+
+    # Replace pytestCommand with pytestCommandNoLLMAPILaunch
+    replaced_line = pytest_command_line.replace("pytestCommand", "pytestCommandNoLLMAPILaunch")
+
+    # Split by space, find and remove the substring with trtllm-llmapi-launch
+    replaced_line_parts = replaced_line.split()
+    replaced_line_parts_no_llmapi = [
+        part for part in replaced_line_parts if "trtllm-llmapi-launch" not in part
+    ]
+    return " ".join(replaced_line_parts_no_llmapi)
+
+
+def get_config_yaml(test_list_path, llm_src):
+    with open(test_list_path, "r") as f:
+        first_line = f.readline().strip()
+
+    if "[" not in first_line or "]" not in first_line:
+        raise ValueError(
+            f"Invalid test list format. Expected test name with brackets: {first_line}"
+        )
+    bracket_content = first_line.split("[")[-1].split("]")[0]
+    parts = bracket_content.split("-")
+    if len(parts) < 2:
+        raise ValueError(
+            f"Invalid test name format. Expected format: prefix-config_name, got: {bracket_content}"
+        )
+
+    # parts[0] is the prefix, parts[1:] is the config name
+    if "disagg" not in parts[0]:
+        raise ValueError(
+            f"Invalid test name format. Expected format: disagg-config_name, got: {bracket_content}"
+        )
+    config_base_name = "-".join(parts[1:])
+    config_yaml_path = os.path.join(
+        llm_src,
+        "tests",
+        "integration",
+        "defs",
+        "perf",
+        "disagg",
+        "test_configs",
+        "disagg",
+        "perf",
+        f"{config_base_name}.yaml",
+    )
+    if not os.path.exists(config_yaml_path):
+        raise FileNotFoundError(f"Config file not found: {config_yaml_path}")
+    return config_yaml_path
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate SLURM launch script for both CI and local modes"
+    )
+    parser.add_argument(
+        "--run-ci",
+        action="store_true",
+        default=False,
+        help="Run in CI mode (true) or local mode (false)",
+    )
+    parser.add_argument("--draft-launch-sh", required=True, help="Path to draft-launch.sh script")
+    parser.add_argument("--launch-sh", required=True, help="Path to output launch.sh script")
+    parser.add_argument("--run-sh", required=True, help="Path to slurm_run.sh script")
+    parser.add_argument("--install-sh", required=True, help="Path to slurm_install.sh script")
+
+    # Optional arguments for local mode
+    parser.add_argument("--config-yaml", default="", help="Path to config YAML file")
+    parser.add_argument("--stage-name", default="", help="Stage name (optional, local mode only)")
+
+    # Optional arguments for CI mode
+    parser.add_argument("--llm-src", default="", help="Path to LLM source code")
+    parser.add_argument("--test-list", default="", help="Path to test list file")
+    parser.add_argument(
+        "--script-prefix",
+        default="",
+        help="Launch script prefix file path (optional, CI mode only)",
+    )
+    parser.add_argument(
+        "--srun-args",
+        default="",
+        help="Path to file containing srun args (optional, CI mode only)",
+    )
+
+    args = parser.parse_args()
+
+    config_yaml = get_config_yaml(args.test_list, args.llm_src)
+
+    with open(config_yaml, "r") as f:
+        config = yaml.safe_load(f)
+
+    # Determine install script path
+    install_script = args.install_sh
+
+    env_config = get_env_config(config)
+    print(f"Environment configuration: {env_config}")
+
+    benchmark_config = get_benchmark_config(config)
+    print(f"Benchmark configuration: {benchmark_config}")
+    benchmark_mode = benchmark_config["mode"]
+
+    hardware_config = get_hardware_config(config, benchmark_mode)
+    print(f"Hardware configuration: {hardware_config}")
+
+    script_prefix_lines = []
+    srun_args_lines = []
+
+    with open(args.script_prefix, "r") as f:
+        script_prefix_content = f.read()
+    script_prefix_lines = script_prefix_content.split("\n")
+    with open(args.srun_args, "r") as f:
+        srun_args_content = f.read()
+
+    srun_args_lines = srun_args_content.split()
+
+    # Extract pytestCommand and generate pytestCommandNoLLMAPILaunch
+    pytest_command_no_llmapi_launch = get_pytest_command_no_llmapilaunch(script_prefix_lines)
+
+    # Build worker env vars, add extra env vars for gen_only mode
+    worker_env_vars = env_config["worker_env_var"]
+    server_env_vars = env_config["server_env_var"]
+    if "gen_only" in benchmark_config["mode"]:
+        concurrency = benchmark_config["concurrency"]
+        worker_env_vars = (
+            "TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 "
+            f"TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1 "
+            f"TLLM_BENCHMARK_REQ_QUEUES_SIZE={concurrency} {worker_env_vars}"
+        )
+        server_env_vars = f"TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 {server_env_vars}"
+        script_prefix_lines.append("export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1")
+        srun_args_lines.append("--container-env=TRTLLM_DISAGG_BENCHMARK_GEN_ONLY")
+
+    script_prefix_lines.extend(
+        [
+            pytest_command_no_llmapi_launch,
+            f'export pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $pytestCommand"',
+            f'export pytestCommandDisaggServer="{server_env_vars} $pytestCommandNoLLMAPILaunch"',
+            f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $pytestCommandNoLLMAPILaunch"',
+            f"export runScript={args.run_sh}",
+            f"export installScript={install_script}",
+            f"export numCtxServers={hardware_config['num_ctx_servers']}",
+            f"export numGenServers={hardware_config['num_gen_servers']}",
+            f"export gpusPerNode={hardware_config['gpus_per_node']}",
+            f"export gpusPerCtxServer={hardware_config['gpus_per_ctx_server']}",
+            f"export gpusPerGenServer={hardware_config['gpus_per_gen_server']}",
+            f"export nodesPerCtxServer={hardware_config['nodes_per_ctx_server']}",
+            f"export nodesPerGenServer={hardware_config['nodes_per_gen_server']}",
+            f"export totalNodes={hardware_config['total_nodes']}",
+            f"export totalGpus={hardware_config['total_gpus']}",
+        ]
+    )
+
+    remove_whitespace_lines(script_prefix_lines)
+    script_prefix = "\n".join(script_prefix_lines)
+
+    remove_whitespace_lines(srun_args_lines)
+    srun_args_lines.extend(
+        [
+            "--container-env=DISAGG_SERVING_TYPE",
+            "--container-env=pytestCommand",
+        ]
+    )
+    srun_args_lines = ["srunArgs=("] + [f' "{line}"' for line in srun_args_lines] + [")"]
+    srun_args = "\n".join(srun_args_lines)
+
+    with open(args.draft_launch_sh, "r") as f:
+        draft_launch_content = f.read()
+    draft_launch_lines = draft_launch_content.split("\n")
+    remove_whitespace_lines(draft_launch_lines)
+    draft_launch_content = "\n".join(draft_launch_lines)
+
+    with open(args.launch_sh, "w") as f:
+        f.write(f"{script_prefix}\n{srun_args}\n{draft_launch_content}")
+
+    print(f"Launch script generated at: {args.launch_sh}")
+    print(f"Launch script:\n{script_prefix}\n{srun_args}\n{draft_launch_content}")
+
+
+if __name__ == "__main__":
+    main()
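Note: a quick illustration of get_pytest_command_no_llmapilaunch with a made-up input line. It assumes trtllm-llmapi-launch appears as a standalone token inside the quoted command (as in CI); otherwise the token removal would also drop the adjacent text. Also worth noting: remove_whitespace_lines returns a new list, and the calls in main() discard that return value.

    lines = ['export pytestCommand="LLM_ROOT=/src trtllm-llmapi-launch pytest -v"']  # made-up example
    print(get_pytest_command_no_llmapilaunch(lines))
    # -> export pytestCommandNoLLMAPILaunch="LLM_ROOT=/src pytest -v"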
jenkins/scripts/slurm_install.sh (new file, 37 lines)
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Set up error handling
+set -Eeuo pipefail
+trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
+
+slurm_install_setup() {
+    cd $resourcePathNode
+    llmSrcNode=$resourcePathNode/TensorRT-LLM/src
+
+    if [ $SLURM_LOCALID -eq 0 ]; then
+        wget -nv $llmTarfile
+        tar -zxf $tarName
+        which python3
+        python3 --version
+        apt-get install -y libffi-dev
+        nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
+        if [[ $pytestCommand == *--run-ray* ]]; then
+            pip3 install --retries 10 ray[default]
+        fi
+        cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
+        cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
+        gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
+        hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
+        echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
+        touch install_lock.lock
+    else
+        while [ ! -f install_lock.lock ]; do
+            sleep 5
+        done
+    fi
+}
+
+# Only run slurm_install_setup when script is executed directly (not sourced)
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+    slurm_install_setup
+fi
jenkins/scripts/slurm_run.sh (file name not preserved in this view; inferred from the sed substitution below)
@@ -39,26 +39,12 @@ if [ $SLURM_PROCID -eq 0 ]; then
     fi
 fi

-if [ $SLURM_LOCALID -eq 0 ]; then
-    wget -nv $llmTarfile
-    tar -zxf $tarName
-    which python3
-    python3 --version
-    apt-get install -y libffi-dev
-    nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
-    if [[ $pytestCommand == *--run-ray* ]]; then
-        pip3 install --retries 10 ray[default]
-    fi
-    cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
-    cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
-    gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
-    hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
-    echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
-    touch install_lock.lock
-else
-    while [ ! -f install_lock.lock ]; do
-        sleep 5
-    done
-fi
+# Aggregated mode will run install together with pytest in slurm_run.sh
+# Disaggregated mode will run install separately in slurm_install.sh
+if [[ "$stageName" != *Disagg* ]]; then
+    installScriptPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_run\.sh/slurm_install.sh/')"
+    source "$installScriptPath"
+    slurm_install_setup
+fi

 if [[ "$stageName" == *GB200* ]]; then
@@ -131,3 +117,9 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
         --files $stageName/perf_script_test_results.csv \
         $basePerfPath
 fi
+
+if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
+    echo "Check Perf-Sanity Result"
+    python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
+        $jobWorkspace
+fi
The following metadata rename is applied identically to 2 perf config files (file names were not preserved in this view):
@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:
New config file (105 lines; file name not preserved in this view — per submit.py's get_config_yaml, these configs live under tests/integration/defs/perf/disagg/test_configs/disagg/perf/):
@@ -0,0 +1,105 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+    - GB200
+    - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+  config_index: -1
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: e2e
+  use_nv_sa_benchmark: true
+  multi_round: 8
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '6144'
+  input_length: 1024
+  output_length: 1024
+  dataset_file: <dataset_file>
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    context_parallel_size: 1
+    max_batch_size: 768
+    max_num_tokens: 768
+    max_seq_len: 2068
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 32
+        - 64
+        - 128
+        - 256
+        - 512
+        - 768
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.8
+      dtype: fp8
+    moe_config:
+      backend: CUTLASS
+      use_low_precision_moe_combine: true
+    cache_transceiver_config:
+      max_tokens_in_buffer: 16384
+      backend: UCX
+    stream_interval: 100
+    num_postprocess_workers: 4
+  ctx:
+    max_batch_size: 16
+    max_num_tokens: 16896
+    max_seq_len: 2044
+    tensor_parallel_size: 4
+    context_parallel_size: 1
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 16384
+      backend: UCX
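Note: feeding this config through submit.py's get_hardware_config sizes the job as follows; the result matches the GB200-12_GPUs-3_Nodes disagg stage registered above.

    # Worked sizing for the e2e config above (tp * pp * cp per worker).
    gpus_per_node = 4
    gpus_per_gen_server = 8 * 1 * 1                                     # gen: tensor_parallel_size 8
    gpus_per_ctx_server = 4 * 1 * 1                                     # ctx: tensor_parallel_size 4
    nodes_per_gen_server = -(-gpus_per_gen_server // gpus_per_node)     # ceil -> 2
    nodes_per_ctx_server = -(-gpus_per_ctx_server // gpus_per_node)     # ceil -> 1
    total_nodes = 1 * nodes_per_ctx_server + 1 * nodes_per_gen_server   # 3 nodes
    total_gpus = total_nodes * gpus_per_node                            # 12 GPUs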
The same metadata rename is applied identically to 6 more perf config files (file names not preserved in this view):
@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:
New config file (122 lines; file name not preserved in this view):
@@ -0,0 +1,122 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  precision: fp4
+  model_dir_name: DeepSeek-R1-0528-FP4-v2
+  supported_gpus:
+    - GB200
+    - GB300
+  script_file: disaggr_torch.slurm
+  benchmark_type: 1k1k
+  config_index: -1
+slurm:
+  script_file: disaggr_torch.slurm
+  partition: <partition>
+  account: <account>
+  job_time: 02:00:00
+  job_name: unified-benchmark
+  extra_args: "--gres=gpu:4"
+  numa_bind: true
+benchmark:
+  mode: gen_only
+  use_nv_sa_benchmark: true
+  multi_round: 1
+  benchmark_ratio: 0.8
+  streaming: true
+  concurrency_list: '1024'
+  input_length: 8192
+  output_length: 1024
+  dataset_file: <dataset_file>
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <full_path_to_work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+  model: local-completions
+  tasks: gsm8k
+  model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
+worker_config:
+  gen:
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    context_parallel_size: 1
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 128
+    max_num_tokens: 512
+    max_seq_len: 9256
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 24
+        - 32
+        - 40
+        - 48
+        - 56
+        - 64
+        - 72
+        - 80
+        - 88
+        - 96
+        - 104
+        - 112
+        - 120
+        - 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    moe_config:
+      backend: CUTEDSL
+      use_low_precision_moe_combine: true
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 16384
+      backend: UCX
+    stream_interval: 100
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    max_batch_size: 2
+    max_num_tokens: 16896
+    max_seq_len: 9256
+    tensor_parallel_size: 4
+    context_parallel_size: 1
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 16384
+      backend: UCX
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
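Note: for this gen_only config, get_hardware_config forces num_ctx_servers to 0 (the benchmark mode contains "gen_only"), so sizing is driven by the gen worker alone and lines up with the commented-out GB200-32_GPUs-8_Nodes disagg stage above.

    # Worked sizing for the gen_only config above.
    gpus_per_node = 4
    gpus_per_gen_server = 32 * 1 * 1                                  # gen: tensor_parallel_size 32
    nodes_per_gen_server = -(-gpus_per_gen_server // gpus_per_node)   # ceil -> 8
    total_nodes = 0 + 1 * nodes_per_gen_server                        # ctx servers skipped -> 8 nodes
    total_gpus = total_nodes * gpus_per_node                          # 32 GPUs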
The same metadata rename is applied identically to 8 more perf config files (file names not preserved in this view):
@@ -1,5 +1,5 @@
 metadata:
-  model_name: deepseek-r1-fp4
+  model_name: deepseek_r1_0528_fp4_v2
   precision: fp4
   model_dir_name: DeepSeek-R1-0528-FP4-v2
   supported_gpus:
@@ -33,6 +33,8 @@ from jenkins.scripts.open_search_db import OpenSearchDB

 PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1"  # "sandbox-trtllm-ci-perf"
 TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info"
+PRE_MERGE_THRESHOLD = 0.1
+POST_MERGE_THRESHOLD = 0.05

 # Metrics where larger is better
 MAXIMIZE_METRICS = [
@@ -268,24 +270,7 @@ def match(history_data, new_data, match_keys):
     def is_empty(value):
         return value is None or value == ""

-    def should_skip_field(field):
-        # Skip fields starting with @, _, ts_
-        if field.startswith('@') or field.startswith('_') or field.startswith(
-                'ts_'):
-            return True
-        # Skip log links and speculative_model_dir and job configs
-        if field in [
-                's_speculative_model_dir', 's_server_log_link',
-                's_ctx_server_log_link', 's_gen_server_log_link',
-                's_client_log_link'
-        ]:
-            return True
-        return False
-
     for field in match_keys:
-        # Skip excluded fields
-        if should_skip_field(field):
-            continue
         history_value = history_data.get(field, None)
         new_value = new_data.get(field, None)
         if is_empty(history_value) and is_empty(new_value):
@@ -412,6 +397,33 @@ def get_history_data(new_data_dict, gpu_type, match_keys):
     return history_baseline_dict, history_data_dict


+def get_threshold(baseline_data, metric):
+    """
+    Get the threshold for a metric from baseline data.
+    """
+    is_post_merge = baseline_data.get("b_is_post_merge", False)
+
+    metric_suffix = metric[2:]  # Remove "d_" prefix
+    if is_post_merge:
+        threshold_key = f"d_threshold_post_merge_{metric_suffix}"
+    else:
+        threshold_key = f"d_threshold_pre_merge_{metric_suffix}"
+
+    # Try to get the specific threshold (post_merge or pre_merge)
+    if threshold_key in baseline_data:
+        return baseline_data[threshold_key]
+
+    # Fall back to general threshold
+    fallback_key = f"d_threshold_{metric_suffix}"
+    if fallback_key in baseline_data:
+        return baseline_data[fallback_key]
+
+    # No threshold found, raise error
+    raise KeyError(
+        f"No threshold found for metric '{metric}'. "
+        f"Expected '{threshold_key}' or '{fallback_key}' in baseline data.")
+
+
 def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
     """Get regressive test cases
     1. For Maximize metrics, if new perf is below baseline * (1 - threshold)
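Note: get_threshold resolves in this order: the merge-phase-specific key (d_threshold_post_merge_* or d_threshold_pre_merge_*, chosen by b_is_post_merge), then the legacy d_threshold_* key, else KeyError. With made-up data:

    baseline = {
        "b_is_post_merge": True,
        "d_threshold_post_merge_seq_throughput": 0.05,
        "d_threshold_seq_throughput": 0.1,
    }
    get_threshold(baseline, "d_seq_throughput")   # -> 0.05 (post-merge key wins)
    del baseline["d_threshold_post_merge_seq_throughput"]
    get_threshold(baseline, "d_seq_throughput")   # -> 0.1 (legacy fallback)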
@@ -419,8 +431,9 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
     Set it as regressive.
     """
     regressive_data_list = []
+    cmd_idxs = new_data_dict.keys()
     # Find regressive test cases
-    for cmd_idx in new_data_dict:
+    for cmd_idx in cmd_idxs:
         if history_baseline_dict[cmd_idx] is None:
             continue

@@ -433,8 +446,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
         for metric in MAXIMIZE_METRICS:
             if metric not in new_data or metric not in baseline_data:
                 continue
-            threshold_key = f"d_threshold_{metric[2:]}"
-            threshold = baseline_data[threshold_key]
+            threshold = get_threshold(baseline_data, metric)
             baseline_value = baseline_data[metric]
             new_value = new_data[metric]
             # Regressive if new_value < baseline_value * (1 - threshold)
@@ -446,8 +458,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
         for metric in MINIMIZE_METRICS:
             if metric not in new_data or metric not in baseline_data:
                 continue
-            threshold_key = f"d_threshold_{metric[2:]}"
-            threshold = baseline_data.get(threshold_key, 0.1)
+            threshold = get_threshold(baseline_data, metric)
             baseline_value = baseline_data[metric]
             new_value = new_data[metric]
             # Regressive if new_value > baseline_value * (1 + threshold)
@@ -464,10 +475,16 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
                 baseline_key = f"d_baseline_{metric[2:]}"
                 regressive_data[baseline_key] = baseline_data[metric]

-                threshold_key = f"d_threshold_{metric[2:]}"
-                if threshold_key in baseline_data:
-                    regressive_data[threshold_key] = baseline_data[
-                        threshold_key]
+                # Copy all threshold keys from baseline
+                metric_suffix = metric[2:]
+                for threshold_key in [
+                        f"d_threshold_{metric_suffix}",
+                        f"d_threshold_post_merge_{metric_suffix}",
+                        f"d_threshold_pre_merge_{metric_suffix}"
+                ]:
+                    if threshold_key in baseline_data:
+                        regressive_data[threshold_key] = baseline_data[
+                            threshold_key]

             # Add regression info string
             regressive_data["s_regression_info"] = ", ".join(regressive_metrics)
@@ -478,8 +495,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
     return regressive_data_list


-def prepare_baseline_data(history_baseline_dict, history_data_dict,
-                          new_data_dict):
+def prepare_baseline_data(history_data_dict, new_data_dict):
     """
     Calculate new baseline from history post-merge data and new data.
     Then return new baseline data.
@@ -491,20 +507,19 @@ def prepare_baseline_data(history_data_dict, new_data_dict):
         # Calculate best metrics from history post-merge data and new data
         best_metrics = calculate_best_perf_result(history_data_dict[cmd_idx],
                                                   new_data_dict[cmd_idx])
-        new_baseline_data = history_baseline_dict[cmd_idx]
-        if new_baseline_data:
-            print_info(f"Baseline data found (cmd_idx: {cmd_idx}) in history")
-        else:
-            print_info(
-                f"No baseline data found (cmd_idx: {cmd_idx}), created a new baseline"
-            )
-            new_baseline_data = new_data_dict[cmd_idx].copy()
-            new_baseline_data["b_is_baseline"] = True
-            add_id(new_baseline_data)
-        # Add or update baseline metrics
+        new_baseline_data = new_data_dict[cmd_idx].copy()
+        new_baseline_data["b_is_baseline"] = True
+        # Add or update baseline metrics and thresholds
         for metric, value in best_metrics.items():
             new_baseline_data[metric] = value
-            new_baseline_data[f"d_threshold_{metric[2:]}"] = 0.1
+            metric_suffix = metric[2:]
+            post_merge_key = f"d_threshold_post_merge_{metric_suffix}"
+            pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}"
+            new_baseline_data[post_merge_key] = new_baseline_data.get(
+                post_merge_key, POST_MERGE_THRESHOLD)
+            new_baseline_data[pre_merge_key] = new_baseline_data.get(
+                pre_merge_key, PRE_MERGE_THRESHOLD)
+        add_id(new_baseline_data)
         new_baseline_data_dict[cmd_idx] = new_baseline_data

     return new_baseline_data_dict
185
tests/integration/defs/perf/perf_regression_check.py
Normal file
185
tests/integration/defs/perf/perf_regression_check.py
Normal file
@ -0,0 +1,185 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys

import yaml

METRICS = [
    "seq_throughput",
    "token_throughput",
    "total_token_throughput",
    "user_throughput",
    "mean_tpot",
    "median_tpot",
    "p99_tpot",
    "mean_ttft",
    "median_ttft",
    "p99_ttft",
    "mean_itl",
    "median_itl",
    "p99_itl",
    "mean_e2el",
    "median_e2el",
    "p99_e2el",
]


def should_skip_execution():
    disagg_type = os.getenv("DISAGG_SERVING_TYPE", "")
    if (
        disagg_type.startswith("GEN")
        or disagg_type.startswith("CTX")
        or disagg_type == "DISAGG_SERVER"
    ):
        return True
    return False


def find_yaml_files(job_workspace, filename):
    yaml_files = []
    for root, dirs, files in os.walk(job_workspace):
        for file in files:
            if file == filename:
                yaml_files.append(os.path.join(root, file))
    return yaml_files


def read_yaml_data(yaml_files):
    all_data = []
    for file_path in yaml_files:
        try:
            with open(file_path, "r") as f:
                data = yaml.safe_load(f)
                if data:
                    if isinstance(data, list):
                        all_data.extend(data)
                    else:
                        all_data.append(data)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    return all_data


def get_metric_keys():
    metric_keys = set()
    for metric in METRICS:
        metric_keys.add(f"d_{metric}")
        metric_keys.add(f"d_baseline_{metric}")
        metric_keys.add(f"d_threshold_{metric}")
    return metric_keys


def print_perf_data(data):
    print("=== Metrics ===")
    for metric in METRICS:
        value_key = f"d_{metric}"
        if value_key in data:
            value = data.get(value_key, "N/A")
            print(f'"{value_key}": {value}')

    metric_keys = get_metric_keys()
    print("\n=== Config ===")
    config_keys = sorted([key for key in data.keys() if key not in metric_keys])
    for key in config_keys:
        value = data[key]
        print(f'"{key}": {value}')


def print_regression_data(data):
    if "s_regression_info" in data:
        print("=== Regression Info ===")
        print(f"{data['s_regression_info']}")

    metric_keys = get_metric_keys()

    print("=== Metrics ===")
    for metric in METRICS:
        value_key = f"d_{metric}"
        baseline_key = f"d_baseline_{metric}"
        threshold_key = f"d_threshold_{metric}"
        # Only print if at least one of the keys exists
        if value_key in data or baseline_key in data or threshold_key in data:
            value = data.get(value_key, "N/A")
            baseline = data.get(baseline_key, "N/A")
            threshold = data.get(threshold_key, "N/A")
            # Calculate percentage difference between value and baseline
            if (
                isinstance(value, (int, float))
                and isinstance(baseline, (int, float))
                and baseline != 0
            ):
                percentage = (value - baseline) / baseline * 100
                percentage_str = f"{percentage:+.2f}%"
            else:
                percentage_str = "N/A"
            print(
                f'"{value_key}": {value}, "{baseline_key}": {baseline}, '
                f'"{threshold_key}": {threshold}, "diff": {percentage_str}'
            )

    print("\n=== Config ===")
    config_keys = sorted([key for key in data.keys() if key not in metric_keys])
    for key in config_keys:
        if key == "s_regression_info":
            continue
        value = data[key]
        print(f'"{key}": {value}')


def main():
    if should_skip_execution():
        print("Skipping check_perf_regression.py due to DISAGG_SERVING_TYPE")
        return 0

    job_workspace = sys.argv[1]

    if not os.path.isdir(job_workspace):
        print(f"Error: {job_workspace} is not a valid directory")
        sys.exit(1)

    perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml")
    all_perf_data = read_yaml_data(perf_data_files)
    print(f"Found {len(all_perf_data)} perf data")
    for i, data in enumerate(all_perf_data):
        print(f"\n{'=' * 60}")
        print(f"Perf Data #{i + 1}")
        print("=" * 60)
        print_perf_data(data)

    print(f"\n{'=' * 60}\n")

    regression_files = find_yaml_files(job_workspace, "regression.yaml")
    all_regression_data = read_yaml_data(regression_files)
    print(f"Found {len(all_regression_data)} regression data")
    for i, data in enumerate(all_regression_data):
        print(f"\n{'=' * 60}")
        print(f"Regression Data #{i + 1}")
        print("=" * 60)
        print_regression_data(data)

    if len(all_regression_data) == 0:
        print("\n No regression data found. Perf check is successful.")
        return 0
    else:
        print(
            f"\n Warning: Found {len(all_regression_data)} regression data. Perf check is failed."
        )
        return 1


if __name__ == "__main__":
    sys.exit(main())
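
As a usage sketch of the new checker's contract (the workspace layout and field values below are assumptions for illustration): any regression.yaml discovered anywhere under the job workspace makes the run return 1, while a workspace containing only perf_data.yaml files returns 0.

import os
import tempfile

import yaml

# Build a throwaway workspace containing one regression record.
workspace = tempfile.mkdtemp()
case_dir = os.path.join(workspace, "case_0")
os.makedirs(case_dir)
with open(os.path.join(case_dir, "regression.yaml"), "w") as f:
    yaml.safe_dump([{
        "s_regression_info": "d_token_throughput dropped below baseline",  # made-up text
        "d_token_throughput": 880.0,
        "d_baseline_token_throughput": 1000.0,
        "d_threshold_token_throughput": 0.1,
    }], f)

# find_yaml_files/read_yaml_data from the script above would pick this file up,
# print the record with a "-12.00%" diff, and main() would return 1.
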
@ -19,14 +19,15 @@ import os
import re
import re
import shutil
import shutil
import socket
import socket
import subprocess
import sys
import sys
from typing import Dict, List, NamedTuple
from typing import Dict, List, NamedTuple

import pytest
import pytest
import yaml
import yaml
from defs.common import get_cpp_benchmark
from defs.common import get_cpp_benchmark
from defs.trt_test_alternative import (is_linux, is_windows, print_info,
from defs.trt_test_alternative import (is_linux, is_windows, print_error,
print_warning)
print_info, print_warning)

from ..conftest import get_llm_root, llm_models_root, trt_environment
from ..conftest import get_llm_root, llm_models_root, trt_environment
from .open_search_db_utils import (SCENARIO_MATCH_FIELDS, add_id,
from .open_search_db_utils import (SCENARIO_MATCH_FIELDS, add_id,
@ -227,6 +228,11 @@ def get_model_dir(model_name: str):
return model_dir
return model_dir


def get_dataset_path():
return os.path.join(llm_models_root(), "datasets",
"ShareGPT_V3_unfiltered_cleaned_split.json")


def cpu_socket_count_gt_1():
def cpu_socket_count_gt_1():
global MAP_BY_SOCKET
global MAP_BY_SOCKET
if MAP_BY_SOCKET is not None:
if MAP_BY_SOCKET is not None:
@ -319,37 +325,37 @@ BENCH_PERF_METRIC_LOG_QUERIES = {

AGGR_SERVER_PERF_METRIC_LOG_QUERIES = {
AGGR_SERVER_PERF_METRIC_LOG_QUERIES = {
PerfMetricType.SEQ_THROUGHPUT:
PerfMetricType.SEQ_THROUGHPUT:
re.compile(r"Request throughput \(req\/s\):\s+([\d\.]+)"),
re.compile(r"Request throughput \(req\/s\):\s+(-?[\d\.]+)"),
PerfMetricType.TOKEN_THROUGHPUT:
PerfMetricType.TOKEN_THROUGHPUT:
re.compile(r"Output token throughput \(tok\/s\):\s+([\d\.]+)"),
re.compile(r"Output token throughput \(tok\/s\):\s+(-?[\d\.]+)"),
PerfMetricType.TOTAL_TOKEN_THROUGHPUT:
PerfMetricType.TOTAL_TOKEN_THROUGHPUT:
re.compile(r"Total Token throughput \(tok\/s\):\s+([\d\.]+)"),
re.compile(r"Total Token throughput \(tok\/s\):\s+(-?[\d\.]+)"),
PerfMetricType.USER_THROUGHPUT:
PerfMetricType.USER_THROUGHPUT:
re.compile(r"User throughput \(tok\/s\):\s+([\d\.]+)"),
re.compile(r"User throughput \(tok\/s\):\s+(-?[\d\.]+)"),
PerfMetricType.FIRST_TOKEN_TIME:
PerfMetricType.FIRST_TOKEN_TIME:
re.compile(r"Mean TTFT \(ms\):\s+([\d\.]+)"),
re.compile(r"Mean TTFT \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.MEDIAN_FIRST_TOKEN_TIME:
PerfMetricType.MEDIAN_FIRST_TOKEN_TIME:
re.compile(r"Median TTFT \(ms\):\s+([\d\.]+)"),
re.compile(r"Median TTFT \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.P99_FIRST_TOKEN_TIME:
PerfMetricType.P99_FIRST_TOKEN_TIME:
re.compile(r"P99 TTFT \(ms\):\s+([\d\.]+)"),
re.compile(r"P99 TTFT \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.INTER_TOKEN_TIME:
PerfMetricType.INTER_TOKEN_TIME:
re.compile(r"Mean ITL \(ms\):\s+([\d\.]+)"),
re.compile(r"Mean ITL \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.MEDIAN_INTER_TOKEN_TIME:
PerfMetricType.MEDIAN_INTER_TOKEN_TIME:
re.compile(r"Median ITL \(ms\):\s+([\d\.]+)"),
re.compile(r"Median ITL \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.P99_INTER_TOKEN_TIME:
PerfMetricType.P99_INTER_TOKEN_TIME:
re.compile(r"P99 ITL \(ms\):\s+([\d\.]+)"),
re.compile(r"P99 ITL \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.OUTPUT_TOKEN_TIME:
PerfMetricType.OUTPUT_TOKEN_TIME:
re.compile(r"Mean TPOT \(ms\):\s+([\d\.]+)"),
re.compile(r"Mean TPOT \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME:
PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME:
re.compile(r"Median TPOT \(ms\):\s+([\d\.]+)"),
re.compile(r"Median TPOT \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.P99_OUTPUT_TOKEN_TIME:
PerfMetricType.P99_OUTPUT_TOKEN_TIME:
re.compile(r"P99 TPOT \(ms\):\s+([\d\.]+)"),
re.compile(r"P99 TPOT \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.INFERENCE_TIME:
PerfMetricType.INFERENCE_TIME:
re.compile(r"Mean E2EL \(ms\):\s+([\d\.]+)"),
re.compile(r"Mean E2EL \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.MEDIAN_INFERENCE_TIME:
PerfMetricType.MEDIAN_INFERENCE_TIME:
re.compile(r"Median E2EL \(ms\):\s+([\d\.]+)"),
re.compile(r"Median E2EL \(ms\):\s+(-?[\d\.]+)"),
PerfMetricType.P99_INFERENCE_TIME:
PerfMetricType.P99_INFERENCE_TIME:
re.compile(r"P99 E2EL \(ms\):\s+([\d\.]+)"),
re.compile(r"P99 E2EL \(ms\):\s+(-?[\d\.]+)"),
}
}

# (Relative threshold, Absolute threshold) for all metric types
# (Relative threshold, Absolute threshold) for all metric types
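
The only change in this hunk is the `-?` added to every capture group, so benchmark log lines carrying negative values parse instead of silently failing to match. A quick self-contained check (the log line is hypothetical):

import re

old = re.compile(r"User throughput \(tok\/s\):\s+([\d\.]+)")
new = re.compile(r"User throughput \(tok\/s\):\s+(-?[\d\.]+)")
line = "User throughput (tok/s):     -1.00"
print(old.search(line))           # None: the minus sign blocks the old pattern
print(new.search(line).group(1))  # '-1.00'
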
@ -512,17 +518,21 @@ class ServerConfig:

def __init__(self, server_config_data: dict, env_vars: str = ""):
def __init__(self, server_config_data: dict, env_vars: str = ""):
# Extract required fields
# Extract required fields
self.mode = server_config_data.get('mode', 'e2e')
self.concurrency = server_config_data.get('concurrency', 1)
self.name = server_config_data['name']
self.name = server_config_data['name']
self.model_name = server_config_data['model_name']
self.model_name = server_config_data['model_name']
self.gpus = server_config_data['gpus']
self.model_path = ""
self.model_path = ""
self.env_vars = env_vars
self.env_vars = env_vars

# Extract optional fields with defaults
# Extract optional fields with defaults
self.tp = server_config_data.get('tensor_parallel_size', self.gpus)
self.tp = server_config_data.get('tensor_parallel_size', 1)
self.ep = server_config_data.get('moe_expert_parallel_size', 1)
self.ep = server_config_data.get('moe_expert_parallel_size', 1)
self.pp = server_config_data.get('pipeline_parallel_size', 1)
self.pp = server_config_data.get('pipeline_parallel_size', 1)
self.gpus_per_node = server_config_data.get('gpus_per_node', self.gpus)
self.cp = server_config_data.get('context_parallel_size', 1)
self.gpus = server_config_data.get('gpus', self.tp * self.cp * self.pp)
self.gpus_per_node = server_config_data.get('gpus_per_node',
0) or self.gpus
self.max_num_tokens = server_config_data.get('max_num_tokens', 2048)
self.max_num_tokens = server_config_data.get('max_num_tokens', 2048)
self.max_batch_size = server_config_data.get('max_batch_size', 512)
self.max_batch_size = server_config_data.get('max_batch_size', 512)
self.max_seq_len = server_config_data.get('max_seq_len', 0)
self.max_seq_len = server_config_data.get('max_seq_len', 0)
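
With the new defaults, the GPU count is derived from the parallelism sizes rather than being a required field. A sketch with hypothetical values:

# tp=8, pp=2, cp=1 and no explicit 'gpus' key -> gpus = 8 * 1 * 2 = 16;
# a gpus_per_node of 0 (or absent) falls back to the full count.
cfg = {'name': 'gen', 'model_name': 'm', 'tensor_parallel_size': 8,
       'pipeline_parallel_size': 2}
tp = cfg.get('tensor_parallel_size', 1)
cp = cfg.get('context_parallel_size', 1)
pp = cfg.get('pipeline_parallel_size', 1)
gpus = cfg.get('gpus', tp * cp * pp)                 # 16
gpus_per_node = cfg.get('gpus_per_node', 0) or gpus  # 16
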
@ -538,6 +548,8 @@ class ServerConfig:
'enable_attention_dp', False)
'enable_attention_dp', False)
self.trust_remote_code = server_config_data.get('trust_remote_code',
self.trust_remote_code = server_config_data.get('trust_remote_code',
False)
False)
self.enable_lm_head_tp_in_adp = server_config_data.get(
'enable_lm_head_tp_in_adp', False)

# attention_dp_config
# attention_dp_config
attention_dp_config = server_config_data.get('attention_dp_config', {})
attention_dp_config = server_config_data.get('attention_dp_config', {})
@ -551,6 +563,12 @@ class ServerConfig:
moe_config = server_config_data.get('moe_config', {})
moe_config = server_config_data.get('moe_config', {})
self.moe_backend = moe_config.get('backend', "")
self.moe_backend = moe_config.get('backend', "")
self.moe_max_num_tokens = moe_config.get('max_num_tokens', 0)
self.moe_max_num_tokens = moe_config.get('max_num_tokens', 0)
self.use_low_precision_moe_combine = moe_config.get(
'use_low_precision_moe_combine', False)
load_balancer_config = moe_config.get('load_balancer', {})
self.load_balancer_num_slots = load_balancer_config.get('num_slots', 0)
self.load_balancer_layer_updates_per_iter = load_balancer_config.get(
'layer_updates_per_iter', 0)

# cuda_graph_config
# cuda_graph_config
cuda_graph_config = server_config_data.get('cuda_graph_config', {})
cuda_graph_config = server_config_data.get('cuda_graph_config', {})
@ -605,10 +623,13 @@ class ServerConfig:
self.match_mode = server_config_data.get('match_mode', "config")
self.match_mode = server_config_data.get('match_mode', "config")

# Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs)
# Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs)
exclude_keys = [
'mode', 'concurrency', 'name', 'model_name', 'gpus',
'gpus_per_node', 'client_configs'
]
self.extra_llm_api_config_data = {
self.extra_llm_api_config_data = {
k: v
k: v
for k, v in server_config_data.items()
for k, v in server_config_data.items() if k not in exclude_keys
if k not in ['name', 'model_name', 'gpus', 'client_configs']
}
}

def to_cmd(self,
def to_cmd(self,
@ -634,8 +655,41 @@ class ServerConfig:
def to_env(self) -> Dict[str, str]:
def to_env(self) -> Dict[str, str]:
return to_env_dict(self.env_vars)
return to_env_dict(self.env_vars)

def to_match_keys(self) -> List[str]:
return [
"s_mode",
"s_model_name",
"l_tp",
"l_ep",
"l_pp",
"l_cp",
"l_gpus_per_node",
"l_max_batch_size",
"b_disable_overlap_scheduler",
"l_num_postprocess_workers",
"s_attn_backend",
"b_enable_chunked_prefill",
"b_enable_attention_dp",
"b_enable_lm_head_tp_in_adp",
# attention_dp_config
"b_attention_dp_balance",
# moe_config
"s_moe_backend",
# cuda_graph_config
"b_enable_cuda_graph",
# kv_cache_config
"s_kv_cache_dtype",
# cache_transceiver_config
"s_cache_transceiver_backend",
# speculative_config
"s_spec_decoding_type",
"l_num_nextn_predict_layers",
]

def to_db_data(self) -> dict:
def to_db_data(self) -> dict:
db_data = {
db_data = {
"s_mode":
self.mode,
"s_model_name":
"s_model_name":
self.model_name.lower(),
self.model_name.lower(),
"l_gpus":
"l_gpus":
@ -646,6 +700,8 @@ class ServerConfig:
self.ep,
self.ep,
"l_pp":
"l_pp":
self.pp,
self.pp,
"l_cp":
self.cp,
"l_gpus_per_node":
"l_gpus_per_node":
self.gpus_per_node,
self.gpus_per_node,
"l_max_num_tokens":
"l_max_num_tokens":
@ -668,6 +724,8 @@ class ServerConfig:
self.enable_attention_dp,
self.enable_attention_dp,
"b_trust_remote_code":
"b_trust_remote_code":
self.trust_remote_code,
self.trust_remote_code,
"b_enable_lm_head_tp_in_adp":
self.enable_lm_head_tp_in_adp,
# attention_dp_config
# attention_dp_config
"b_attention_dp_balance":
"b_attention_dp_balance":
self.attention_dp_balance,
self.attention_dp_balance,
@ -680,6 +738,12 @@ class ServerConfig:
self.moe_backend,
self.moe_backend,
"l_moe_max_num_tokens":
"l_moe_max_num_tokens":
self.moe_max_num_tokens,
self.moe_max_num_tokens,
"b_use_low_precision_moe_combine":
self.use_low_precision_moe_combine,
"l_load_balancer_num_slots":
self.load_balancer_num_slots,
"l_load_balancer_layer_updates_per_iter":
self.load_balancer_layer_updates_per_iter,
# cuda_graph_config
# cuda_graph_config
"b_enable_cuda_graph":
"b_enable_cuda_graph":
self.enable_cuda_graph,
self.enable_cuda_graph,
@ -754,7 +818,7 @@ class ClientConfig:
self.osl = client_config_data.get('osl', 1024)
self.osl = client_config_data.get('osl', 1024)
self.random_range_ratio = client_config_data.get(
self.random_range_ratio = client_config_data.get(
'random_range_ratio', 0.0)
'random_range_ratio', 0.0)
self.backend = client_config_data.get('backend', "")
self.backend = client_config_data.get('backend', "openai")
self.use_chat_template = client_config_data.get('use_chat_template',
self.use_chat_template = client_config_data.get('use_chat_template',
False)
False)
self.streaming = client_config_data.get('streaming', True)
self.streaming = client_config_data.get('streaming', True)
@ -765,18 +829,36 @@ class ClientConfig:
model_dir = get_model_dir(self.model_name)
model_dir = get_model_dir(self.model_name)
self.model_path = model_dir if os.path.exists(
self.model_path = model_dir if os.path.exists(
model_dir) else self.model_name
model_dir) else self.model_name
dataset_path = get_dataset_path()
benchmark_cmd = [
benchmark_cmd = [
"python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
"python",
"--model", self.model_path, "--dataset-name", "random",
"-m",
"--random-ids", "--num-prompts",
"tensorrt_llm.serve.scripts.benchmark_serving",
str(self.concurrency * self.iterations), "--random-input-len",
"--model",
str(self.isl), "--random-output-len",
self.model_path,
str(self.osl), "--random-range-ratio",
"--tokenizer",
str(self.random_range_ratio), "--ignore-eos",
self.model_path,
"--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency",
"--dataset-name",
str(self.concurrency)
"random",
"--random-ids",
"--num-prompts",
str(self.concurrency * self.iterations),
"--max-concurrency",
str(self.concurrency),
"--random-input-len",
str(self.isl),
"--random-output-len",
str(self.osl),
"--random-range-ratio",
str(self.random_range_ratio),
"--trust-remote-code",
"--ignore-eos",
"--percentile-metrics",
"ttft,tpot,itl,e2el",
]
]
if dataset_path and os.path.exists(dataset_path):
benchmark_cmd.append("--dataset-path")
benchmark_cmd.append(dataset_path)
if self.backend:
if self.backend:
benchmark_cmd.append("--backend")
benchmark_cmd.append("--backend")
benchmark_cmd.append(self.backend)
benchmark_cmd.append(self.backend)
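
Flattened, the rebuilt invocation comes out roughly as below for a hypothetical client config (concurrency=8, iterations=10, isl=osl=1024, range ratio 0.0; num-prompts is concurrency * iterations = 80, and the ShareGPT path is only appended when it exists under the models root):

python -m tensorrt_llm.serve.scripts.benchmark_serving \
    --model <model_path> --tokenizer <model_path> \
    --dataset-name random --random-ids \
    --num-prompts 80 --max-concurrency 8 \
    --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.0 \
    --trust-remote-code --ignore-eos \
    --percentile-metrics ttft,tpot,itl,e2el \
    --dataset-path <models_root>/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
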
@ -789,6 +871,18 @@ class ClientConfig:
def to_env(self) -> Dict[str, str]:
def to_env(self) -> Dict[str, str]:
return to_env_dict(self.env_vars)
return to_env_dict(self.env_vars)

def to_match_keys(self) -> List[str]:
return [
"l_concurrency",
"l_iterations",
"l_isl",
"l_osl",
"d_random_range_ratio",
"s_backend",
"b_use_chat_template",
"b_streaming",
]

def to_db_data(self) -> dict:
def to_db_data(self) -> dict:
"""Convert ClientConfig to Database data"""
"""Convert ClientConfig to Database data"""
db_data = {
db_data = {
@ -867,36 +961,37 @@ def parse_aggr_config_file(config_file_path: str, select_pattern: str = None):
else:
else:
execution_plan = None
execution_plan = None

# Read YAML config file
with open(config_file_path, 'r') as f:
with open(config_file_path, 'r') as f:
config = yaml.safe_load(f)
config = yaml.safe_load(f)

# Read environment config
metadata = config.get('metadata', {})
environment = config.get('environment', {})
environment = config.get('environment', {})
if not environment:
hardware = config.get('hardware', {})
environment = {}
gpus_per_node = hardware.get('gpus_per_node', 0)

# Get environment variables
model_name = metadata.get('model_name', '')
environment.get('worker_env_var', '')
server_env_var = environment.get('server_env_var', '')
server_env_var = environment.get('server_env_var', '')
client_env_var = environment.get('client_env_var', '')
client_env_var = environment.get('client_env_var', '')

server_configs = []
server_configs = []
server_client_configs = {}
server_client_configs = {}

for server_config_data in config['server_configs']:
for server_config_data in config['server_configs']:
server_name = server_config_data['name']
server_name = server_config_data['name']
server_config_data[
'model_name'] = model_name if 'model_name' not in server_config_data else server_config_data[
'model_name']
server_config_data['mode'] = 'e2e'
server_config_data['concurrency'] = -1
server_config_data['gpus_per_node'] = gpus_per_node

# Check if this server should be included based on execution_plan
# Check if this server should be included based on execution_plan
if execution_plan is not None and server_name not in execution_plan:
if execution_plan is not None and server_name not in execution_plan:
continue
continue

# Create ServerConfig object directly from dict
server_config = ServerConfig(server_config_data, server_env_var)
server_config = ServerConfig(server_config_data, server_env_var)
server_id = len(server_configs)
server_id = len(server_configs)
server_configs.append(server_config)
server_configs.append(server_config)

# Create ClientConfig objects
client_configs = []
client_configs = []
selected_client_names = execution_plan.get(
selected_client_names = execution_plan.get(
server_name) if execution_plan else None
server_name) if execution_plan else None
@ -905,7 +1000,6 @@ def parse_aggr_config_file(config_file_path: str, select_pattern: str = None):
client_name = client_config_data['name']
client_name = client_config_data['name']

# Check if this client should be included
# Check if this client should be included
# Include if: execution_plan is None OR selected_client_names is None OR client_name in selected_client_names
if execution_plan is not None and selected_client_names is not None:
if execution_plan is not None and selected_client_names is not None:
if client_name not in selected_client_names:
if client_name not in selected_client_names:
continue
continue
@ -929,46 +1023,48 @@ def parse_multi_node_disagg_config_file(config_file_path: str,
config = yaml.safe_load(f)
config = yaml.safe_load(f)

disagg_configs = []
disagg_configs = []
metadata = config.get('metadata', {})
hardware = config.get('hardware', {})
hardware = config.get('hardware', {})
benchmark = config.get('benchmark', {})
benchmark = config.get('benchmark', {})
environment = config.get('environment', {})
environment = config.get('environment', {})
slurm_config = config.get('slurm', {})
slurm_config = config.get('slurm', {})
worker_config = config.get('worker_config', {})
worker_config = config.get('worker_config', {})
timeout = slurm_config.get('timeout', 3600)
timeout = slurm_config.get('timeout', 7200)
numa_bind = slurm_config.get('numa_bind', False)
numa_bind = slurm_config.get('numa_bind', False)
gpus_per_node = hardware.get('gpus_per_node', 0)
model_name = metadata.get('model_name', '')
assert model_name, "model_name is required in metadata section"

# Get model name from environment
benchmark_mode = benchmark.get('mode', 'e2e')
model_name = environment.get('model_name', '')
if "gen_only" in benchmark_mode:
assert model_name, "model_name is required in environment section"
hardware['num_ctx_servers'] = 0

# Get environment variables
worker_env_var = environment.get('worker_env_var', '')
worker_env_var = environment.get('worker_env_var', '')
server_env_var = environment.get('server_env_var', '')
server_env_var = environment.get('server_env_var', '')
client_env_var = environment.get('client_env_var', '')
client_env_var = environment.get('client_env_var', '')

# Create ctx_server config data
concurrency_str = benchmark.get('concurrency_list', '1')
if isinstance(concurrency_str, str):
concurrency = max(int(x) for x in concurrency_str.split())
else:
concurrency = int(concurrency_str)

ctx_server_config_data = {
ctx_server_config_data = {
'mode': benchmark_mode,
'concurrency': concurrency,
'name': 'ctx',
'name': 'ctx',
'model_name': model_name,
'model_name': model_name,
'gpus': hardware.get('gpus_per_ctx_server'),
'gpus_per_node': gpus_per_node,
'gpus_per_node': hardware.get('gpus_per_node'),
**worker_config.get('ctx', {})
**worker_config.get('ctx', {})
}
}

# Create gen_server config data
gen_server_config_data = {
gen_server_config_data = {
'mode': benchmark_mode,
'concurrency': concurrency,
'name': 'gen',
'name': 'gen',
'model_name': model_name,
'model_name': model_name,
'gpus': hardware.get('gpus_per_gen_server'),
'gpus_per_node': gpus_per_node,
'gpus_per_node': hardware.get('gpus_per_node'),
**worker_config.get('gen', {})
**worker_config.get('gen', {})
}
}

# Create client config data
concurrency_str = benchmark.get('concurrency_list', '1')
concurrency = int(concurrency_str) if isinstance(concurrency_str,
str) else concurrency_str

client_config_data = {
client_config_data = {
'name': 'client',
'name': 'client',
'concurrency': concurrency,
'concurrency': concurrency,
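
The concurrency parsing change means a space-separated sweep list now contributes its maximum to the server configs, instead of only being coerced when already an int. For example:

# 'concurrency_list' as it might appear in the YAML (hypothetical sweep):
concurrency_str = "1 8 32 128"
if isinstance(concurrency_str, str):
    concurrency = max(int(x) for x in concurrency_str.split())
else:
    concurrency = int(concurrency_str)
print(concurrency)  # 128, the largest point of the sweep
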
@ -980,13 +1076,12 @@ def parse_multi_node_disagg_config_file(config_file_path: str,
'use_chat_template': False,
'use_chat_template': False,
'streaming': benchmark.get('streaming', True),
'streaming': benchmark.get('streaming', True),
}
}

# Create disagg_config dict
disagg_config = {
disagg_config = {
'disagg_serving_type': disagg_serving_type,
'disagg_serving_type': disagg_serving_type,
'hostname': socket.gethostname(),
'hostname': socket.gethostname(),
'numa_bind': numa_bind,
'numa_bind': numa_bind,
'timeout': timeout,
'timeout': timeout,
'mode': benchmark_mode,
'name': 'disagg_config',
'name': 'disagg_config',
'model_name': model_name,
'model_name': model_name,
'hardware': hardware,
'hardware': hardware,
@ -995,9 +1090,7 @@ def parse_multi_node_disagg_config_file(config_file_path: str,
'server_env_var': server_env_var,
'server_env_var': server_env_var,
'client': ClientConfig(client_config_data, model_name, client_env_var),
'client': ClientConfig(client_config_data, model_name, client_env_var),
}
}
print_info(f"disagg_config: {disagg_config}")
disagg_configs.append(disagg_config)
disagg_configs.append(disagg_config)

return disagg_configs
return disagg_configs


@ -1114,6 +1207,8 @@ class PerfTestConfig:
self.upload_to_db = False
self.upload_to_db = False
self.config_file = None
self.config_file = None
self.gpu_type = None
self.gpu_type = None
self.config_dir = None
self.config_file = None
self.config_path = None
self.config_path = None
self.select_pattern = None
self.select_pattern = None
# Aggregated mode
# Aggregated mode
@ -1330,35 +1425,47 @@ class PerfTestConfig:
# Extract configs from test param labels.
# Extract configs from test param labels.
labels = test_param_labels.split("-")
labels = test_param_labels.split("-")

def get_gpu_type(label: str) -> str:
def get_gpu_type() -> str:
parts = label.split("_")
try:
if len(parts) < 2 or parts[0] != "l0":
output = subprocess.check_output(["nvidia-smi", "-L"],
return ""
stderr=subprocess.DEVNULL,
if parts[1] == "dgx":
text=True)
if len(parts) >= 3:
first_line = output.strip().split("\n")[0]
gpu_type = f"{parts[1]}_{parts[2]}"
gpu_models = ["GB300", "GB200", "B300", "B200"]
else:
for model in gpu_models:
gpu_type = ""
if model in first_line:
else:
if model.startswith("B") and not model.startswith("GB"):
gpu_type = parts[1]
return f"dgx_{model.lower()}"
return gpu_type.lower()
return model.lower()
except (subprocess.CalledProcessError, FileNotFoundError,
IndexError):
print_error(
f"Failed to get GPU type: {subprocess.CalledProcessError}")
return ""

# Used for perf sanity test
if "perf_sanity" in labels[0]:
if "perf_sanity" in labels[0]:
assert len(labels) > 1, "perf_sanity test must have a config file!"
assert len(labels) > 1, "perf_sanity test must have a config file!"
is_disagg = "disagg" in labels[0]
self.upload_to_db = "upload" in labels[0]
self.upload_to_db = "upload" in labels[0]
self.config_file = labels[1]
self.gpu_type = get_gpu_type()
if "disagg" in labels[1]:
if is_disagg:
# For disagg, test name is like: perf_sanity_disagg-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX
# labels[0] is perf_sanity_disagg, "-".join(labels[1:]) is config file base name
self.runtime = "multi_node_disagg_server"
self.runtime = "multi_node_disagg_server"
self.config_dir = "tests/integration/defs/perf/disagg/test_configs/disagg/perf"
config_base = "-".join(labels[1:])
self.config_file = f"{config_base}.yaml" if not config_base.endswith(
".yaml") else config_base
self.select_pattern = None
else:
else:
# For aggr, test name is like: perf_sanity_aggr-l0_dgx_b300-r1_fp8_dep8_mtp1_1k1k
# labels[0] is perf_sanity_aggr, labels[1] is config file base name, labels[2] is select_pattern (optional)
self.runtime = "aggr_server"
self.runtime = "aggr_server"
self.gpu_type = get_gpu_type(labels[1])
self.config_dir = "tests/scripts/perf-sanity"
config_folder = os.getenv("TRTLLM_CONFIG_FOLDER",
config_base = labels[1]
"tests/scripts/perf-sanity")
self.config_file = f"{config_base}.yaml" if config_base and not config_base.endswith(
self.config_path = os.path.join(
".yaml") else config_base
config_folder, f"{labels[1]}.yaml"
self.select_pattern = labels[2] if len(labels) > 2 else None
if not labels[1].endswith(".yaml") else labels[1])
self.select_pattern = labels[2] if len(labels) > 2 else None
return
return

self.model_name = labels.pop(0)
self.model_name = labels.pop(0)
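
The GPU type is now probed from the driver rather than parsed out of the test label. On a hypothetical GB200 node the matching logic reduces to the following (note GB300/GB200 are checked before B300/B200, which matters because "B200" is a substring of "GB200"):

# First line of `nvidia-smi -L` on a hypothetical node:
first_line = "GPU 0: NVIDIA GB200 (UUID: GPU-...)"
gpu_models = ["GB300", "GB200", "B300", "B200"]
for model in gpu_models:
    if model in first_line:
        # Bare B200/B300 boards are reported as dgx_<model>; GB* boards keep their name.
        if model.startswith("B") and not model.startswith("GB"):
            print(f"dgx_{model.lower()}")
        else:
            print(model.lower())  # -> "gb200"
        break
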
@ -1578,21 +1685,19 @@ class PerfTestConfig:
[b >= 32 for b in self.batch_sizes]
[b >= 32 for b in self.batch_sizes]
), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32."
), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32."

def set_aggr_server_configs(self, llm_root: str) -> None:
def set_aggr_server_configs(self) -> None:
"""
"""
Set the server and client configs.
Set the server and client configs.
"""
"""
config_file_path = os.path.join(llm_root, self.config_path)
_, self.server_configs, self.server_client_configs = parse_aggr_config_file(
_, self.server_configs, self.server_client_configs = parse_aggr_config_file(
config_file_path, self.select_pattern)
self.config_path, self.select_pattern)

def set_multi_node_disagg_server_configs(self, llm_root: str) -> None:
def set_multi_node_disagg_server_configs(self) -> None:
"""
"""
Set the multi-node disaggregated server configs.
Set the multi-node disaggregated server configs.
"""
"""
config_file_path = os.path.join(llm_root, self.config_path)
self.disagg_configs = parse_multi_node_disagg_config_file(
self.disagg_configs = parse_multi_node_disagg_config_file(
config_file_path, self.select_pattern)
self.config_path, self.select_pattern)

def get_model_family(self) -> str:
def get_model_family(self) -> str:
"""
"""
@ -1682,6 +1787,13 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
output_dir,
output_dir,
perf_cache_fpath,
perf_cache_fpath,
gpu_clock_lock=None) -> None:
gpu_clock_lock=None) -> None:
if self._config.runtime == "aggr_server" or self._config.runtime == "multi_node_disagg_server":
self._config.config_dir = os.getenv(
"TRTLLM_CONFIG_FOLDER",
os.path.join(llm_root, self._config.config_dir))
self._config.config_path = os.path.join(self._config.config_dir,
self._config.config_file)

if self._config.runtime == "cpp":
if self._config.runtime == "cpp":
if not self._config.is_bert_like():
if not self._config.is_bert_like():
raise ValueError(
raise ValueError(
@ -1695,12 +1807,12 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
benchmark_script = "trtllm-bench"
benchmark_script = "trtllm-bench"
elif self._config.runtime == "aggr_server":
elif self._config.runtime == "aggr_server":
benchmark_script = None
benchmark_script = None
self._config.set_aggr_server_configs(llm_root)
self._config.set_aggr_server_configs()
elif self._config.runtime == "disagg_server":
elif self._config.runtime == "disagg_server":
benchmark_script = None
benchmark_script = None
elif self._config.runtime == "multi_node_disagg_server":
elif self._config.runtime == "multi_node_disagg_server":
benchmark_script = None
benchmark_script = None
self._config.set_multi_node_disagg_server_configs(llm_root)
self._config.set_multi_node_disagg_server_configs()
else:
else:
raise RuntimeError(f"Invalid runtime {self._config.runtime}.")
raise RuntimeError(f"Invalid runtime {self._config.runtime}.")

@ -1730,15 +1842,12 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):

def get_trtllm_aggr_commands(self, output_dir):
def get_trtllm_aggr_commands(self, output_dir):
server_cmds = []
server_cmds = []
server_envs = []
client_cmds = []
client_cmds = []
client_envs = []
names = []
names = []
for server_idx, client_configs in self._config.server_client_configs.items(
for server_idx, client_configs in self._config.server_client_configs.items(
):
):
server_config = self._config.server_configs[server_idx]
server_config = self._config.server_configs[server_idx]
server_cmd = server_config.to_cmd(output_dir)
server_cmd = server_config.to_cmd(output_dir)
server_env = server_config.to_env()
# Generate extra-llm-api-config.yml
# Generate extra-llm-api-config.yml
config_content = server_config.generate_extra_llm_api_config()
config_content = server_config.generate_extra_llm_api_config()
config_filename = f"extra-llm-api-config.{server_config.name}.yml"
config_filename = f"extra-llm-api-config.{server_config.name}.yml"
@ -1747,49 +1856,35 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
f.write(config_content)
f.write(config_content)
for client_config in client_configs:
for client_config in client_configs:
server_cmds.append(server_cmd)
server_cmds.append(server_cmd)
server_envs.append(server_env)
client_cmd = client_config.to_cmd()
client_cmd = client_config.to_cmd()
client_env = client_config.to_env()
client_cmds.append(client_cmd)
client_cmds.append(client_cmd)
client_envs.append(client_env)
names.append(f"{server_config.name}-{client_config.name}")
names.append(f"{server_config.name}-{client_config.name}")
return server_cmds, server_envs, client_cmds, client_envs, names
return server_cmds, client_cmds, names

def get_trtllm_multi_node_disagg_commands(self, output_dir):
def get_trtllm_multi_node_disagg_commands(self, output_dir):
ctx_server_cmds = []
ctx_server_cmds = []
ctx_server_envs = []
gen_server_cmds = []
gen_server_cmds = []
gen_server_envs = []
disagg_server_cmds = []
disagg_server_cmds = []
disagg_server_envs = []
benchmark_cmds = []
benchmark_cmds = []
benchmark_envs = []
cmd_idx = 0
cmd_idx = 0
for disagg_config in self._config.disagg_configs:
for disagg_config in self._config.disagg_configs:
disagg_serving_type = disagg_config['disagg_serving_type']
disagg_serving_type = disagg_config['disagg_serving_type']
disagg_config['hostname']
disagg_config['hostname']
numa_bind = disagg_config['numa_bind']
numa_bind = disagg_config['numa_bind']
ctx_server_cmd = None
ctx_server_cmd = None
ctx_server_env = None
gen_server_cmd = None
gen_server_cmd = None
gen_server_env = None
disagg_server_cmd = None
disagg_server_cmd = None
disagg_server_env = None
benchmark_cmd = None
benchmark_cmd = None
benchmark_env = None
if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type:
if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type:
is_ctx = "CTX" in disagg_serving_type
is_ctx = "CTX" in disagg_serving_type
server_config = disagg_config[
server_config = disagg_config[
'ctx_server'] if is_ctx else disagg_config['gen_server']
'ctx_server'] if is_ctx else disagg_config['gen_server']
server_cmd = server_config.to_cmd(output_dir, numa_bind,
server_cmd = server_config.to_cmd(output_dir, numa_bind,
disagg_serving_type)
disagg_serving_type)
server_env = server_config.to_env()
if is_ctx:
if is_ctx:
ctx_server_cmd = server_cmd
ctx_server_cmd = server_cmd
ctx_server_env = server_env
else:
else:
gen_server_cmd = server_cmd
gen_server_cmd = server_cmd
gen_server_env = server_env
# Generate extra-llm-api-config.yml
# Generate extra-llm-api-config.yml
config_content = server_config.generate_extra_llm_api_config()
config_content = server_config.generate_extra_llm_api_config()
config_filename = f"extra-llm-api-config.{server_config.name}.yml"
config_filename = f"extra-llm-api-config.{server_config.name}.yml"
@ -1805,21 +1900,15 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
str(timeout), "-r",
str(timeout), "-r",
str(timeout)
str(timeout)
]
]
disagg_server_env = to_env_dict(disagg_config['server_env_var'])
elif "BENCHMARK" in disagg_serving_type:
elif "BENCHMARK" in disagg_serving_type:
# Generate benchmark command if this is the BENCHMARK server node
# Generate benchmark command if this is the BENCHMARK server node
benchmark_cmd = disagg_config['client'].to_cmd()
benchmark_cmd = disagg_config['client'].to_cmd()
benchmark_env = disagg_config['client'].to_env()
ctx_server_cmds.append(ctx_server_cmd)
ctx_server_cmds.append(ctx_server_cmd)
ctx_server_envs.append(ctx_server_env)
gen_server_cmds.append(gen_server_cmd)
gen_server_cmds.append(gen_server_cmd)
gen_server_envs.append(gen_server_env)
disagg_server_cmds.append(disagg_server_cmd)
disagg_server_cmds.append(disagg_server_cmd)
disagg_server_envs.append(disagg_server_env)
benchmark_cmds.append(benchmark_cmd)
benchmark_cmds.append(benchmark_cmd)
benchmark_envs.append(benchmark_env)
cmd_idx += 1
cmd_idx += 1
return ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs
return ctx_server_cmds, gen_server_cmds, disagg_server_cmds, benchmark_cmds

def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list:
def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list:
build_cmd = [
build_cmd = [
@ -2094,12 +2183,10 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
if is_aggr:
if is_aggr:
if not os.path.exists(perf_sanity_output_dir):
if not os.path.exists(perf_sanity_output_dir):
os.makedirs(perf_sanity_output_dir, exist_ok=True)
os.makedirs(perf_sanity_output_dir, exist_ok=True)
server_cmds, server_envs, client_cmds, client_envs, names = self.get_trtllm_aggr_commands(
server_cmds, client_cmds, names = self.get_trtllm_aggr_commands(
perf_sanity_output_dir)
perf_sanity_output_dir)
return PerfAggrScriptTestCmds(server_cmds=server_cmds,
return PerfAggrScriptTestCmds(server_cmds=server_cmds,
server_envs=server_envs,
client_cmds=client_cmds,
client_cmds=client_cmds,
client_envs=client_envs,
names=names,
names=names,
timeout=3600,
timeout=3600,
output_dir=perf_sanity_output_dir)
output_dir=perf_sanity_output_dir)
@ -2115,17 +2202,13 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
if is_multi_node_disagg:
if is_multi_node_disagg:
if not os.path.exists(perf_sanity_output_dir):
if not os.path.exists(perf_sanity_output_dir):
os.makedirs(perf_sanity_output_dir, exist_ok=True)
os.makedirs(perf_sanity_output_dir, exist_ok=True)
ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs = self.get_trtllm_multi_node_disagg_commands(
ctx_server_cmds, gen_server_cmds, disagg_server_cmds, benchmark_cmds = self.get_trtllm_multi_node_disagg_commands(
perf_sanity_output_dir)
perf_sanity_output_dir)
return PerfMultiNodeDisaggScriptTestCmds(
return PerfMultiNodeDisaggScriptTestCmds(
ctx_server_cmds=ctx_server_cmds,
ctx_server_cmds=ctx_server_cmds,
ctx_server_envs=ctx_server_envs,
gen_server_cmds=gen_server_cmds,
gen_server_cmds=gen_server_cmds,
gen_server_envs=gen_server_envs,
disagg_server_cmds=disagg_server_cmds,
disagg_server_cmds=disagg_server_cmds,
disagg_server_envs=disagg_server_envs,
benchmark_cmds=benchmark_cmds,
benchmark_cmds=benchmark_cmds,
benchmark_envs=benchmark_envs,
timeout=self._config.disagg_configs[0]['timeout'],
timeout=self._config.disagg_configs[0]['timeout'],
hostname=self._config.disagg_configs[0]['hostname'],
hostname=self._config.disagg_configs[0]['hostname'],
disagg_serving_type=self._config.disagg_configs[0]
disagg_serving_type=self._config.disagg_configs[0]
@ -2156,6 +2239,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
build_cmd = self.get_trtllm_bench_build_command(engine_dir)
build_cmd = self.get_trtllm_bench_build_command(engine_dir)
else:
else:
pytest.skip("only support trtllm-bench runtime for now")
pytest.skip("only support trtllm-bench runtime for now")

# Construct prepare synthetic data command
# Construct prepare synthetic data command
data_cmds = []
data_cmds = []

@ -2293,32 +2377,24 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
#print info to separate cases
#print info to separate cases
self._current_cmd_idx = 0
self._current_cmd_idx = 0
metrics = self._get_metrics()
metrics = self._get_metrics()
commands = self.get_commands()
outputs = {}
outputs = {}
result_states = {}
result_states = {}
errors = []
errors = []

def add_myelin_time_pass_to(input_env):
# Only trtllm-bench needs to prepare dataset first.
time_pass_flag = r" -time_pass=on"
old_myelin_env = input_env.get("__LUNOWUD", "")
if time_pass_flag not in old_myelin_env:
input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag
return old_myelin_env

old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env)
if self._config.runtime == 'bench':
if self._config.runtime == 'bench':
#prepare dataset first for trtllm-bench
print_info(f"Running command for generating dataset")
print_info(f"Running command for generating dataset")
outputs = self.run_ex("prepare_dataset",
outputs = self.run_ex(commands=commands,
None,
cmd_idx=self._current_cmd_idx,
llm_venv,
full_test_name="prepare_dataset",
gpu_clock_lock,
metric_type=None,
session_data_writer,
venv=llm_venv,
output_dir,
gpu_clock_lock=gpu_clock_lock,
session_data_writer=session_data_writer,
output_dir=output_dir,
outputs=outputs,
outputs=outputs,
original_test_name="prepare_dataset",
original_test_name="prepare_dataset")
cmd_idx=self._current_cmd_idx)

# Save the result state.
result_state = self.get_result_state()
result_state = self.get_result_state()
result_states[self._current_cmd_idx] = result_state
result_states[self._current_cmd_idx] = result_state
if result_state != "valid":
if result_state != "valid":
@ -2349,15 +2425,16 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
# Run the command or reuse the existing output logs.
# Run the command or reuse the existing output logs.
print_info(f"Running command for {metric.metric_name}")
print_info(f"Running command for {metric.metric_name}")
outputs = self.run_ex(
outputs = self.run_ex(
metric.metric_name,
commands=commands,
metric.metric_type,
cmd_idx=self._current_cmd_idx,
llm_venv,
full_test_name=metric.metric_name,
gpu_clock_lock,
metric_type=metric.metric_type,
session_data_writer,
venv=llm_venv,
output_dir,
gpu_clock_lock=gpu_clock_lock,
session_data_writer=session_data_writer,
output_dir=output_dir,
outputs=outputs,
outputs=outputs,
original_test_name=metric.original_test_name,
original_test_name=metric.original_test_name)
cmd_idx=self._current_cmd_idx)

# Save the result state.
# Save the result state.
result_state = self.get_result_state()
result_state = self.get_result_state()
@ -2373,6 +2450,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
# Clean up engine dir after use.
# Clean up engine dir after use.
shutil.rmtree(self._get_engine_dir(), ignore_errors=True)
shutil.rmtree(self._get_engine_dir(), ignore_errors=True)

def add_myelin_time_pass_to(input_env):
time_pass_flag = r" -time_pass=on"
old_myelin_env = input_env.get("__LUNOWUD", "")
if time_pass_flag not in old_myelin_env:
input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag
return old_myelin_env

old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env)
llm_venv._new_env["__LUNOWUD"] = old_llm_venv
llm_venv._new_env["__LUNOWUD"] = old_llm_venv

# Check if any commands failed.
# Check if any commands failed.
@ -2393,14 +2478,19 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
Upload the test results and baseline to database.
Upload the test results and baseline to database.
"""
"""

def prefix_server_config_dict(config_dict: dict,
def add_prefix(key: str, prefix_name: str) -> str:
prefix_name: str) -> dict:
type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_'
prefixed_dict = {}
rest = key[2:]
for key, value in config_dict.items():
return f"{type_prefix}{prefix_name}_{rest}"
type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_'
rest = key[2:]
def add_list_prefix(config_list: List, prefix_name: str) -> List:
prefixed_dict[f"{type_prefix}{prefix_name}_{rest}"] = value
return [add_prefix(key, prefix_name) for key in config_list]
return prefixed_dict
def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict:
return {
add_prefix(key, prefix_name): value
for key, value in config_dict.items()
}

match_keys = []
match_keys = []
# Only aggr_server and multi_node_disagg_server will upload.
# Only aggr_server and multi_node_disagg_server will upload.
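
For illustration, the helpers keep the one-character type tag in front and splice the server role after it, so typed keys stay typed. A standalone re-statement with hypothetical inputs:

def add_prefix(key, prefix_name):
    return f"{key[0:2]}{prefix_name}_{key[2:]}"  # 'l_tp' + 'ctx' -> 'l_ctx_tp'

print(add_prefix("l_tp", "ctx"))                                # l_ctx_tp
print([add_prefix(k, "gen") for k in ["l_tp", "s_moe_backend"]])
                                                                # ['l_gen_tp', 's_gen_moe_backend']
print({add_prefix(k, "ctx"): v for k, v in {"l_gpus": 8}.items()})
                                                                # {'l_ctx_gpus': 8}
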
@ -2441,12 +2531,12 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
new_data_dict[cmd_idx] = new_data
new_data_dict[cmd_idx] = new_data
cmd_idx += 1
cmd_idx += 1
if not match_keys:
if not match_keys:
match_keys.append("s_runtime")
if server_config.match_mode == "scenario":
if server_config.match_mode == "scenario":
match_keys = SCENARIO_MATCH_FIELDS.copy()
match_keys = SCENARIO_MATCH_FIELDS.copy()
else:
else:
match_keys.append("s_runtime")
match_keys.extend(server_config.to_match_keys())
match_keys.extend(server_config_dict.keys())
match_keys.extend(client_config.to_match_keys())
match_keys.extend(client_config_dict.keys())

elif self._config.runtime == "multi_node_disagg_server":
elif self._config.runtime == "multi_node_disagg_server":
if self._config.disagg_configs[0][
if self._config.disagg_configs[0][
@ -2472,27 +2562,28 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
)
)
gen_server_config_dict = disagg_config['gen_server'].to_db_data(
gen_server_config_dict = disagg_config['gen_server'].to_db_data(
)
)
ctx_server_config_dict = prefix_server_config_dict(
ctx_server_config_dict, 'ctx')
gen_server_config_dict = prefix_server_config_dict(
gen_server_config_dict, 'gen')
client_config_dict = disagg_config['client'].to_db_data()
client_config_dict = disagg_config['client'].to_db_data()
# Build new_data
ctx_server_config_dict = add_dict_prefix(
ctx_server_config_dict, 'ctx')
gen_server_config_dict = add_dict_prefix(
gen_server_config_dict, 'gen')

hardware = disagg_config.get('hardware', {})
num_ctx_servers = hardware.get('num_ctx_servers', 0)
num_gen_servers = hardware.get('num_gen_servers', 0)
new_data = {
new_data = {
"s_runtime": "multi_node_disagg_server",
"s_runtime": "multi_node_disagg_server",
"s_server_env_var": disagg_config['server_env_var']
"s_benchmark_mode": disagg_config['mode'],
"s_server_env_var": disagg_config['server_env_var'],
"l_num_ctx_servers": num_ctx_servers,
"l_num_gen_servers": num_gen_servers
}
}
new_data.update(job_config)
new_data.update(job_config)
new_data.update(ctx_server_config_dict)
if num_ctx_servers > 0:
new_data.update(gen_server_config_dict)
new_data.update(ctx_server_config_dict)
if num_gen_servers > 0:
new_data.update(gen_server_config_dict)
new_data.update(client_config_dict)
new_data.update(client_config_dict)
# Add hardware information
hardware = disagg_config.get('hardware', {})
new_data["l_num_ctx_servers"] = hardware.get(
'num_ctx_servers', 0)
new_data["l_num_gen_servers"] = hardware.get(
'num_gen_servers', 0)
# Add metrics from test results
for metric_type in AGGR_SERVER_METRICS:
for metric_type in AGGR_SERVER_METRICS:
new_data[
new_data[
f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[
f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[
@ -2503,9 +2594,17 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
|
|||||||
if not match_keys:
|
if not match_keys:
|
||||||
match_keys.extend(
|
match_keys.extend(
|
||||||
["s_runtime", "l_num_ctx_servers", "l_num_gen_servers"])
|
["s_runtime", "l_num_ctx_servers", "l_num_gen_servers"])
|
||||||
match_keys.extend(ctx_server_config_dict.keys())
|
if num_ctx_servers > 0:
|
||||||
match_keys.extend(gen_server_config_dict.keys())
|
match_keys.extend(
|
||||||
match_keys.extend(client_config_dict.keys())
|
add_list_prefix(
|
||||||
|
disagg_config['ctx_server'].to_match_keys(),
|
||||||
|
'ctx'))
|
||||||
|
if num_gen_servers > 0:
|
||||||
|
match_keys.extend(
|
||||||
|
add_list_prefix(
|
||||||
|
disagg_config['gen_server'].to_match_keys(),
|
||||||
|
'gen'))
|
||||||
|
match_keys.extend(disagg_config['client'].to_match_keys())
|
||||||
else:
|
else:
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -2519,7 +2618,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
         if is_post_merge:
             # Prepare new baseline data for post-merge
             new_baseline_data_dict = prepare_baseline_data(
-                history_baseline_dict, history_data_dict, new_data_dict)
+                history_data_dict, new_data_dict)
         else:
             # Pre-merge does not need to upload baseline data
             new_baseline_data_dict = None
@@ -245,9 +245,7 @@ class PerfBenchScriptTestCmds(NamedTuple):


 class PerfAggrScriptTestCmds(NamedTuple):
     server_cmds: List[List[str]]
-    server_envs: List[Dict[str, str]]
     client_cmds: List[List[str]]
-    client_envs: List[Dict[str, str]]
     names: List[str]
     timeout: int
     output_dir: str
@@ -345,13 +343,9 @@ class PerfDisaggScriptTestCmds(NamedTuple):


 class PerfMultiNodeDisaggScriptTestCmds(NamedTuple):
     ctx_server_cmds: List[List[str]]
-    ctx_server_envs: List[Dict[str, str]]
     gen_server_cmds: List[List[str]]
-    gen_server_envs: List[Dict[str, str]]
     disagg_server_cmds: List[List[str]]
-    disagg_server_envs: List[Dict[str, str]]
     benchmark_cmds: List[List[str]]
-    benchmark_envs: List[Dict[str, str]]
     timeout: int
     hostname: str
     disagg_serving_type: str
@@ -694,23 +688,21 @@ class AbstractPerfScriptTestClass(abc.ABC):
         )

     def run_ex(self,
+               commands,
                full_test_name: str,
                metric_type: PerfMetricType,
                venv: Optional[PythonVenvRunnerImpl],
                gpu_clock_lock: GPUClockLock,
                session_data_writer: SessionDataWriter,
                output_dir: str,
+               cmd_idx: int = 0,
                outputs: Dict[int, str] = {},
                original_test_name: str = None,
-               cmd_idx: int = 0,
                **kwargs) -> List[str]:
         """
         Run the commands and write the results to the output csv and/or yaml files.
         """

-        # Get the commands.
-        commands = self.get_commands()
-
         # Avoid modifying argument directly
         outputs = outputs.copy()
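`run_ex` keeps `outputs: Dict[int, str] = {}` as a mutable default argument and immediately copies it ("Avoid modifying argument directly"). A standalone illustration of why that copy matters, independent of this file:

```python
def collect(value, outputs: dict = {}):
    # The default dict is created once per function object; every call that
    # omits `outputs` would share and mutate the same dict without this copy.
    outputs = outputs.copy()
    outputs[len(outputs)] = value
    return outputs


first = collect("a")
second = collect("b")
assert first == {0: "a"} and second == {0: "b"}  # no state leaks between calls
```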
@@ -723,7 +715,6 @@ class AbstractPerfScriptTestClass(abc.ABC):

         cmd_str = commands.get_cmd_str(cmd_idx)
         is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str
-
         is_perf_sanity_test = "perf_sanity" in full_test_name

         is_disagg_server = False
@@ -804,7 +795,8 @@ class AbstractPerfScriptTestClass(abc.ABC):
             outputs.pop(cmd_idx)
         elif is_disagg_server:
             print_info(
-                f"skip writing perf result when running disagg's server.")
+                f"skip writing perf result when running disagg's worker or server."
+            )
         else:
             self._perf_result = self.get_perf_result(outputs)
@@ -15,9 +15,9 @@ l0_dgx_b200_perf_sanity:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)

 - condition:
     ranges:
@@ -34,8 +34,8 @@ l0_dgx_b200_perf_sanity:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180)
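The renamed entries change the middle segment of each test id from the testlist name (`l0_dgx_b200`) to the perf-sanity config file stem (for example `deepseek_r1_fp8_blackwell`), and the mode prefix from `perf_sanity_upload` to `perf_sanity_aggr_upload`. A sketch of how such an id decomposes (the field names are illustrative, not the harness's actual parser):

```python
test_id = ("perf/test_perf.py::test_perf"
           "[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k]")

params = test_id.split("[", 1)[1].rstrip("]")
mode, config_file, server_config = params.split("-", 2)
assert mode == "perf_sanity_aggr_upload"           # upload mode, aggregated runtime
assert config_file == "deepseek_r1_fp8_blackwell"  # tests/scripts/perf-sanity/<stem>.yaml
assert server_config == "r1_fp8_dep8_mtp1_1k1k"    # a server_configs[].name in that file
```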
@@ -16,9 +16,9 @@ l0_dgx_b300_perf_sanity:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)

 - condition:
     ranges:
@@ -36,6 +36,6 @@ l0_dgx_b300_perf_sanity:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
|||||||
@ -14,6 +14,6 @@ l0_gb200_multi_gpus_perf_sanity:
|
|||||||
stage: post_merge
|
stage: post_merge
|
||||||
backend: pytorch
|
backend: pytorch
|
||||||
tests:
|
tests:
|
||||||
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_1k1k]
|
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
|
||||||
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k]
|
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k]
|
||||||
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_1k1k]
|
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k]
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
version: 0.0.1
|
version: 0.0.1
|
||||||
l0_gb200_multi_nodes_perf_sanity:
|
l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001:
|
||||||
- condition:
|
- condition:
|
||||||
ranges:
|
ranges:
|
||||||
# 2 nodes with each node has 4 GPUs
|
# 2 nodes with each node has 4 GPUs
|
||||||
@ -13,4 +13,4 @@ l0_gb200_multi_nodes_perf_sanity:
|
|||||||
stage: post_merge
|
stage: post_merge
|
||||||
backend: pytorch
|
backend: pytorch
|
||||||
tests:
|
tests:
|
||||||
- perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_nodes-r1_fp4_v2_dep8_mtp1]
|
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1]
|
||||||
@@ -0,0 +1,16 @@
+version: 0.0.1
+l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001:
+- condition:
+    ranges:
+      # 3 nodes with each node has 4 GPUs
+      system_gpu_count:
+        gte: 12
+        lte: 12
+    wildcards:
+      gpu:
+      - '*gb200*'
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (180)
@@ -0,0 +1,16 @@
+version: 0.0.1
+l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001:
+- condition:
+    ranges:
+      # 6 nodes with each node has 4 GPUs
+      system_gpu_count:
+        gte: 24
+        lte: 24
+    wildcards:
+      gpu:
+      - '*gb200*'
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)
@@ -0,0 +1,16 @@
+version: 0.0.1
+l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002:
+- condition:
+    ranges:
+      # 6 nodes with each node has 4 GPUs
+      system_gpu_count:
+        gte: 24
+        lte: 24
+    wildcards:
+      gpu:
+      - '*gb200*'
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (180)
@@ -0,0 +1,16 @@
+version: 0.0.1
+l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001:
+- condition:
+    ranges:
+      # 8 nodes with each node has 4 GPUs
+      system_gpu_count:
+        gte: 32
+        lte: 32
+    wildcards:
+      gpu:
+      - '*gb200*'
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)
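These disagg test names pack the whole topology into one string: `ctx<N>_gen<N>` (context and generation server counts), `dep<N>` (attention-DP/expert-parallel width), `bs<N>` (batch size), `eplb<N>`, `mtp<N>`, and the KV-cache transport (`UCX` or `NIXL`). A hedged sketch of pulling those fields out — a regex over the naming convention, not the repo's own parser:

```python
import re

name = "deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX"

pattern = re.compile(
    r"ctx(?P<ctx>\d+)_gen(?P<gen>\d+)_dep(?P<dep>\d+)_bs(?P<bs>\d+)"
    r"_eplb(?P<eplb>\d+)_mtp(?P<mtp>\d+).*-(?P<transport>UCX|NIXL)$")
fields = pattern.search(name).groupdict()
# The 6-node (24-GPU) testlists above host 2 ctx servers and 1 gen server with
# 16-way attention DP; the exact GPU split per server is not spelled out here.
assert fields == {"ctx": "2", "gen": "1", "dep": "16", "bs": "128",
                  "eplb": "0", "mtp": "3", "transport": "UCX"}
```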
@@ -4,106 +4,31 @@ Performance sanity testing scripts for TensorRT-LLM with configuration-driven te

 ## Overview

-- Run performance sanity benchmarks across multiple model configurations
+- Run performance sanity benchmarks across multiple model configs
 - Support three deployment architectures: single-node, multi-node aggregated, and multi-node disaggregated
-- Manage test cases through YAML configuration files
+- Manage test cases through YAML config files
 - Automated resource calculation and job submission via SLURM

 ## Configuration File Types

-There are three types of YAML configuration files for different deployment architectures:
+There are three types of YAML config files for different deployment architectures.
+Aggregated config files are in [`tests/scripts/perf-sanity`](./).
+Disaggregated config files are in [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf).

 ### 1. Single-Node Aggregated Test Configuration

-**File Example**: `l0_dgx_b200.yaml`
+**File Example**: `deepseek_r1_fp4_v2_grace_blackwell.yaml`

 **Use Case**: Single-node performance tests on a single server with multiple GPUs.

-**Structure**:
-```yaml
-server_configs:
-  - name: "r1_fp8_dep8_mtp1_1k1k"
-    model_name: "deepseek_r1_0528_fp8"
-    gpus: 8
-    tensor_parallel_size: 8
-    moe_expert_parallel_size: 8
-    pipeline_parallel_size: 1
-    max_batch_size: 512
-    max_num_tokens: 8192
-    attention_backend: "TRTLLM"
-    enable_attention_dp: true
-    attention_dp_config:
-      batching_wait_iters: 0
-      enable_balance: true
-      timeout_iters: 60
-    moe_config:
-      backend: 'DEEPGEMM'
-    cuda_graph_config:
-      enable_padding: true
-      max_batch_size: 512
-    kv_cache_config:
-      dtype: 'fp8'
-      enable_block_reuse: false
-    free_gpu_memory_fraction: 0.8
-    speculative_config:
-      decoding_type: 'MTP'
-      num_nextn_predict_layers: 1
-    client_configs:
-      - name: "con4096_iter10_1k1k"
-        concurrency: 4096
-        iterations: 10
-        isl: 1024
-        osl: 1024
-        random_range_ratio: 0.8
-        backend: "openai"
-```
-
-
 ### 2. Multi-Node Aggregated Test Configuration

-**File Example**: `l0_gb200_multi_nodes.yaml`
+**File Example**: `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml`

 **Use Case**: Multi-node aggregated architecture where model runs across multiple nodes with unified execution.

-**Structure**:
-```yaml
-# Hardware Config
-hardware:
-  gpus_per_node: 4
-  gpus_per_server: 8
-
-server_configs:
-  - name: "r1_fp4_v2_dep8_mtp1"
-    model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 8
-    gpus_per_node: 4
-    trust_remote_code: true
-    tensor_parallel_size: 8
-    moe_expert_parallel_size: 8
-    pipeline_parallel_size: 1
-    max_batch_size: 512
-    max_num_tokens: 2112
-    attn_backend: "TRTLLM"
-    enable_attention_dp: true
-    attention_dp_config:
-      batching_wait_iters: 0
-      enable_balance: true
-      timeout_iters: 60
-    moe_config:
-      backend: 'CUTLASS'
-    cuda_graph_config:
-      enable_padding: true
-      max_batch_size: 512
-    kv_cache_config:
-      dtype: 'fp8'
-      enable_block_reuse: false
-    free_gpu_memory_fraction: 0.5
-    client_configs:
-      - name: "con32_iter12_1k1k"
-        concurrency: 32
-        iterations: 12
-        isl: 1024
-        osl: 1024
-        random_range_ratio: 0.8
-        backend: "openai"
-```
+### 3. Multi-Node Disaggregated Test Configuration
+
+**File Example**: `deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml`
+
+**Use Case**: Disaggregated architecture where model runs across multiple nodes with separate context (prefill) and generation (decode) servers.
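The README now points at the two config trees without inlining the full YAML schema. As a quick orientation, a minimal sketch of reading one aggregated config and deriving the SLURM shape from it; the field names are taken from the configs later in this diff, while the loader itself and the GPU-count rule are assumptions, not repo code:

```python
import yaml  # PyYAML

with open("tests/scripts/perf-sanity/"
          "deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml") as f:
    cfg = yaml.safe_load(f)

gpus_per_node = cfg["hardware"]["gpus_per_node"]  # e.g. 4 on GB200 nodes
for server in cfg["server_configs"]:
    # With the per-entry `gpus:` field removed in this commit, the GPU count
    # follows from the parallelism settings (an assumption for illustration).
    gpus = server["tensor_parallel_size"] * server["pipeline_parallel_size"]
    nodes = -(-gpus // gpus_per_node)  # ceiling division
    print(f"{server['name']}: {gpus} GPUs across {nodes} node(s)")
```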
@@ -1,13 +1,13 @@
-# Hardware Config
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  supported_gpus:
+    - GB200
+    - GB300
 hardware:
   gpus_per_node: 4
-  gpus_per_server: 8

 server_configs:
   - name: "r1_fp4_v2_dep8_mtp1"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 8
-    gpus_per_node: 4
     trust_remote_code: true
     tensor_parallel_size: 8
     moe_expert_parallel_size: 8
@@ -37,11 +37,8 @@ server_configs:
     osl: 1024
     random_range_ratio: 0.2
     backend: "openai"
-
   - name: "r1_fp4_v2_tep8_mtp3"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 8
-    gpus_per_node: 4
     trust_remote_code: true
     tensor_parallel_size: 8
     moe_expert_parallel_size: 8
tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml (new file, 99 lines)
@@ -0,0 +1,99 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  supported_gpus:
+    - B200
+    - B300
+server_configs:
+  - name: "r1_fp4_v2_dep4_mtp1_1k1k"
+    model_name: "deepseek_r1_0528_fp4_v2"
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    pipeline_parallel_size: 1
+    max_batch_size: 512
+    max_num_tokens: 8192
+    attn_backend: "TRTLLM"
+    enable_attention_dp: true
+    attention_dp_config:
+      batching_wait_iters: 0
+      enable_balance: true
+      timeout_iters: 60
+    moe_config:
+      backend: 'CUTLASS'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 512
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    speculative_config:
+      decoding_type: 'MTP'
+      num_nextn_predict_layers: 1
+    client_configs:
+      - name: "con2048_iter10_1k1k"
+        concurrency: 2048
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
+
+  - name: "r1_fp4_v2_tep4_mtp3_1k1k"
+    model_name: "deepseek_r1_0528_fp4_v2"
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    pipeline_parallel_size: 1
+    max_batch_size: 32
+    max_num_tokens: 8192
+    attn_backend: "TRTLLM"
+    enable_attention_dp: false
+    moe_config:
+      backend: 'TRTLLM'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 32
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    speculative_config:
+      decoding_type: 'MTP'
+      num_nextn_predict_layers: 3
+    client_configs:
+      - name: "con32_iter10_1k1k"
+        concurrency: 32
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
+
+  - name: "r1_fp4_v2_tp4_mtp3_1k1k"
+    model_name: "deepseek_r1_0528_fp4_v2"
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 1
+    pipeline_parallel_size: 1
+    max_batch_size: 4
+    max_num_tokens: 8192
+    attn_backend: "TRTLLM"
+    enable_attention_dp: false
+    moe_config:
+      backend: 'TRTLLM'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 4
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    speculative_config:
+      decoding_type: 'MTP'
+      num_nextn_predict_layers: 3
+    client_configs:
+      - name: "con4_iter10_1k1k"
+        concurrency: 4
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
@@ -1,8 +1,12 @@
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2
+  supported_gpus:
+    - GB200
+    - GB300
 server_configs:
   # 1k1k configs
   - name: "r1_fp4_v2_dep4_mtp1_1k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 4
     pipeline_parallel_size: 1
@@ -37,7 +41,6 @@ server_configs:

   - name: "r1_fp4_v2_tep4_mtp3_1k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 4
     pipeline_parallel_size: 1
@@ -68,7 +71,6 @@ server_configs:

   - name: "r1_fp4_v2_tp4_mtp3_1k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 1
     pipeline_parallel_size: 1
@@ -100,7 +102,6 @@ server_configs:
   # 8k1k configs
   - name: "r1_fp4_v2_dep4_mtp1_8k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 4
     pipeline_parallel_size: 1
@@ -135,7 +136,6 @@ server_configs:

   - name: "r1_fp4_v2_tep4_mtp3_8k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 4
     pipeline_parallel_size: 1
@@ -166,7 +166,6 @@ server_configs:

   - name: "r1_fp4_v2_tp4_mtp3_8k1k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 1
     pipeline_parallel_size: 1
@@ -198,7 +197,6 @@ server_configs:
   # 1k8k configs
   - name: "r1_fp4_v2_dep4_mtp1_1k8k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 4
     pipeline_parallel_size: 1
@@ -233,7 +231,6 @@ server_configs:

   - name: "r1_fp4_v2_tep4_mtp3_1k8k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 4
     pipeline_parallel_size: 1
@@ -264,7 +261,6 @@ server_configs:

   - name: "r1_fp4_v2_tp4_mtp3_1k8k"
     model_name: "deepseek_r1_0528_fp4_v2"
-    gpus: 4
     tensor_parallel_size: 4
     moe_expert_parallel_size: 1
     pipeline_parallel_size: 1
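Each config file now carries a `metadata.supported_gpus` list (`B200`/`B300` for the DGX files, `GB200`/`GB300` for the Grace Blackwell ones) instead of an explicit per-entry `gpus:` field tied to one testlist. A sketch of the kind of gate this enables; the helper is hypothetical, not code from this PR:

```python
def is_supported(metadata: dict, detected_gpu: str) -> bool:
    # detected_gpu would be the platform string, e.g. "GB200" on a
    # Grace Blackwell node (hypothetical helper for illustration).
    return detected_gpu in metadata.get("supported_gpus", [])


metadata = {"model_name": "deepseek_r1_0528_fp4_v2",
            "supported_gpus": ["GB200", "GB300"]}
assert is_supported(metadata, "GB200")
assert not is_supported(metadata, "B200")
```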
tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml (new file, 99 lines)
@@ -0,0 +1,99 @@
+metadata:
+  model_name: deepseek_r1_0528_fp8
+  supported_gpus:
+    - B200
+    - B300
+server_configs:
+  - name: "r1_fp8_dep8_mtp1_1k1k"
+    model_name: "deepseek_r1_0528_fp8"
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    pipeline_parallel_size: 1
+    max_batch_size: 512
+    max_num_tokens: 8192
+    attn_backend: "TRTLLM"
+    enable_attention_dp: true
+    attention_dp_config:
+      batching_wait_iters: 0
+      enable_balance: true
+      timeout_iters: 60
+    moe_config:
+      backend: 'DEEPGEMM'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 512
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    speculative_config:
+      decoding_type: 'MTP'
+      num_nextn_predict_layers: 1
+    client_configs:
+      - name: "con4096_iter10_1k1k"
+        concurrency: 4096
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
+
+  - name: "r1_fp8_tep8_mtp3_1k1k"
+    model_name: "deepseek_r1_0528_fp8"
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 8
+    pipeline_parallel_size: 1
+    max_batch_size: 64
+    max_num_tokens: 8192
+    attn_backend: "TRTLLM"
+    enable_attention_dp: false
+    moe_config:
+      backend: 'DEEPGEMM'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 64
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    speculative_config:
+      decoding_type: 'MTP'
+      num_nextn_predict_layers: 3
+    client_configs:
+      - name: "con64_iter10_1k1k"
+        concurrency: 64
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
+
+  - name: "r1_fp8_tp8_mtp3_1k1k"
+    model_name: "deepseek_r1_0528_fp8"
+    tensor_parallel_size: 8
+    moe_expert_parallel_size: 1
+    pipeline_parallel_size: 1
+    max_batch_size: 8
+    max_num_tokens: 8192
+    attn_backend: "TRTLLM"
+    enable_attention_dp: false
+    moe_config:
+      backend: 'TRTLLM'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 8
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    speculative_config:
+      decoding_type: 'MTP'
+      num_nextn_predict_layers: 3
+    client_configs:
+      - name: "con8_iter10_1k1k"
+        concurrency: 8
+        iterations: 10
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
tests/scripts/perf-sanity/gpt_oss_120b_fp4_blackwell.yaml (new file, 101 lines)
@@ -0,0 +1,101 @@
+metadata:
+  model_name: gpt_oss_120b_fp4
+  supported_gpus:
+    - B200
+    - B300
+server_configs:
+  - name: "gpt_oss_fp4_dep2_1k1k"
+    model_name: "gpt_oss_120b_fp4"
+    tensor_parallel_size: 2
+    moe_expert_parallel_size: 2
+    pipeline_parallel_size: 1
+    max_batch_size: 1024
+    max_num_tokens: 20000
+    attn_backend: "TRTLLM"
+    enable_attention_dp: true
+    attention_dp_config:
+      enable_balance: true
+    moe_config:
+      backend: 'TRTLLM'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 1024
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    num_postprocess_workers: 4
+    stream_interval: 20
+    client_configs:
+      - name: "con2048_iter5_1k1k"
+        concurrency: 2048
+        iterations: 5
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
+
+  - name: "gpt_oss_fp4_dep4_1k1k"
+    model_name: "gpt_oss_120b_fp4"
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    pipeline_parallel_size: 1
+    max_batch_size: 512
+    max_num_tokens: 20000
+    attn_backend: "TRTLLM"
+    enable_attention_dp: true
+    attention_dp_config:
+      enable_balance: true
+    moe_config:
+      backend: 'TRTLLM'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 512
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    num_postprocess_workers: 4
+    stream_interval: 20
+    client_configs:
+      - name: "con2048_iter5_1k1k"
+        concurrency: 2048
+        iterations: 5
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
+
+  - name: "gpt_oss_fp4_tp4_eagle3_1k1k"
+    model_name: "gpt_oss_120b_fp4"
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 1
+    pipeline_parallel_size: 1
+    max_batch_size: 1
+    max_num_tokens: 20000
+    attn_backend: "TRTLLM"
+    enable_attention_dp: false
+    moe_config:
+      backend: 'TRTLLM'
+    cuda_graph_config:
+      enable_padding: true
+      max_batch_size: 1
+    kv_cache_config:
+      dtype: 'fp8'
+      enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    speculative_config:
+      decoding_type: 'Eagle'
+      eagle3_layers_to_capture: [-1]
+      max_draft_len: 3
+      speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3"
+    stream_interval: 20
+    num_postprocess_workers: 4
+    client_configs:
+      - name: "con1_iter32_1k1k"
+        concurrency: 1
+        iterations: 32
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.2
+        backend: "openai"
@@ -1,293 +0,0 @@
[deleted file — likely the old combined `l0_dgx_b200.yaml`: its nine server/client config entries (r1_fp8_{dep8_mtp1,tep8_mtp3,tp8_mtp3}_1k1k, r1_fp4_v2_{dep4_mtp1,tep4_mtp3,tp4_mtp3}_1k1k, and gpt_oss_fp4_{dep2,dep4,tp4_eagle3}_1k1k) match the entries in the three new per-model files above, except that each old entry also carried an explicit `gpus:` field and the file had no `metadata:` block.]
@@ -1,194 +0,0 @@
[deleted file — likely the old combined `l0_gb200_multi_gpus.yaml`: its six server/client config entries (r1_fp8_{dep8_mtp1,tep8_mtp3,tp8_mtp3}_1k1k and r1_fp4_v2_{dep4_mtp1,tep4_mtp3,tp4_mtp3}_1k1k, each with an explicit `gpus:` field) duplicate the corresponding entries in the new per-model files above.]