[TRTLLM-9834][feat] Transfer to TRTLLM-INFRA Database and Fail post-merge tests if regression (#10282)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
chenfeiz0326 2025-12-31 21:44:59 +08:00 committed by GitHub
parent 464847c6be
commit a23c6f1092
24 changed files with 1995 additions and 1568 deletions

View File

@ -893,7 +893,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def disaggMode = stageName.contains("Perf-Sanity-Disagg")
def perfSanityMode = stageName.contains("PerfSanity")
def disaggMode = stageName.contains("PerfSanity-Disagg")
def setSegment = disaggMode
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@ -938,6 +939,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py"
stage("[${stageName}] Initializing Test") {
// Create Job Workspace folder in Frontend Node
@ -1020,6 +1023,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
coverageConfigFile
)
if (perfSanityMode) {
Utils.copyFileToRemoteHost(
pipeline,
remote,
perfCheckScriptLocal,
perfCheckScriptNode,
true
)
}
// Generate Pytest command
String pytestUtil = ""
if (nodeCount > 1) {
@ -1094,7 +1107,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Define environment variables to export
def envVarNames = [
'OPEN_SEARCH_DB_BASE_URL',
'OPEN_SEARCH_DB_CREDENTIALS',
'OPEN_SEARCH_DB_CREDENTIALS_USR',
'OPEN_SEARCH_DB_CREDENTIALS_PSW',
'BUILD_ID',
'BUILD_URL',
'JOB_NAME',
@ -1300,6 +1314,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
),
numRetries: 3
)
if (perfSanityMode) {
stage("[${stageName}] Check perf result") {
def perfCheckResult = Utils.exec(
pipeline,
script: Utils.sshUserCmd(
remote,
"python3 ${perfCheckScriptNode} ${jobWorkspace}"
),
returnStatus: true
)
if (perfCheckResult != 0) {
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
}
}
echo "Finished test stage execution."
@ -2785,7 +2815,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
error "Some tests still failed after rerun attempts, please check the test report."
}
if (perfMode && !stageName.contains("Perf-Sanity")) {
if (perfMode) {
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
stage("Check perf result") {
@ -2811,7 +2841,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
}
if (perfMode && stageName.contains("Perf-Sanity")) {
if (stageName.contains("PerfSanity")) {
stage ("Check perf result") {
def perfCheckResult = sh(
script: """
@ -2820,10 +2850,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
""",
returnStatus: true
)
// TODO: Enable this when perf regression check is stable
// if (perfCheckResult != 0) {
// error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
// }
if (perfCheckResult != 0) {
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
}
}
@ -3187,7 +3216,7 @@ def launchTestJobs(pipeline, testFilter)
"RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
]
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
def config = VANILLA_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG
@ -3198,7 +3227,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("Pybind")) {
config = PYBIND_CONFIG
}
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
}]]}
fullSet = parallelJobs.keySet()
@ -3219,9 +3248,12 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
// "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8],
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8],
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8],
]
fullSet += x86SlurmTestConfigs.keySet()
@ -3233,7 +3265,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
}]]}
parallelJobs += parallelSlurmJobs
@ -3252,11 +3284,30 @@ def launchTestJobs(pipeline, testFilter)
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
// Perf sanity post merge test
"GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
// Disable GB300 stages because the nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
// Perf sanity pre merge test
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
// Perf sanity post merge test
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
]
fullSet += SBSASlurmTestConfigs.keySet()
@ -3268,13 +3319,15 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// Perf sanity post merge aggr tests
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
// Perf sanity post merge disagg tests
"GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
// Perf sanity pre merge tests
// "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// Perf sanity post merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
]
fullSet += multiNodesSBSAConfigs.keySet()
@ -3292,7 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
}]]}
parallelJobs += parallelSlurmJobs
@ -3305,7 +3358,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
}]]}
parallelJobs += parallelMultiNodesSBSAJobs

View File

@ -51,6 +51,7 @@ TEST_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-test_info"
JOB_MACHINE_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-job_machine_info"
FAILED_STEP_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-failed_step_info"
PR_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-pr_info"
PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"
READ_ACCESS_PROJECT_NAME = [
JOB_PROJECT_NAME,
@ -59,9 +60,12 @@ READ_ACCESS_PROJECT_NAME = [
JOB_MACHINE_PROJECT_NAME,
FAILED_STEP_PROJECT_NAME,
PR_PROJECT_NAME,
PERF_SANITY_PROJECT_NAME,
]
WRITE_ACCESS_PROJECT_NAME = []
WRITE_ACCESS_PROJECT_NAME = [
PERF_SANITY_PROJECT_NAME,
]
DISABLE_OPEN_SEARCH_DB_FOR_LOCAL_TEST = False
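For reference, the new index name follows the same naming pattern as the existing `*_PROJECT_NAME` constants. A minimal sketch, assuming hypothetical values for `PROJECT_ROOT` and `MODE` (their real values are defined earlier in `open_search_db.py` and are not shown in this diff):

```python
# Hypothetical values for illustration only.
PROJECT_ROOT = "trtllm"
MODE = "prod"

PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"
print(PERF_SANITY_PROJECT_NAME)  # -> trtllm-ci-prod-perf_sanity_info
```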

View File

@ -108,7 +108,7 @@ eval $pytestCommand
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv"
else
@ -135,14 +135,6 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
fi
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
echo "Check Perf-Sanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
fi
if [ "$pytest_exit_code" -ne 0 ]; then
final_exit_code=$pytest_exit_code
elif [ "$perf_check_exit_code" -ne 0 ]; then

View File

@ -29,12 +29,14 @@ _project_root = os.path.abspath(
os.path.join(os.path.dirname(__file__), '../../../..'))
if _project_root not in sys.path:
sys.path.insert(0, _project_root)
from jenkins.scripts.open_search_db import OpenSearchDB
from jenkins.scripts.open_search_db import (PERF_SANITY_PROJECT_NAME,
OpenSearchDB)
PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1" # "sandbox-trtllm-ci-perf"
TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info"
PRE_MERGE_THRESHOLD = 0.1
POST_MERGE_THRESHOLD = 0.05
POC_PROJECT_NAME = "sandbox-temp-trtllm-ci-perf-v1-test_info"
USE_POC_DB = os.environ.get("USE_POC_DB", "false").lower() == "true"
TEST_INFO_PROJECT_NAME = POC_PROJECT_NAME if USE_POC_DB else PERF_SANITY_PROJECT_NAME
MAX_QUERY_SIZE = 5000
QUERY_LOOKBACK_DAYS = 90
# Metrics where larger is better
MAXIMIZE_METRICS = [
@ -67,7 +69,6 @@ MINIMIZE_METRICS = [
SCENARIO_MATCH_FIELDS = [
"s_runtime",
"s_model_name",
"s_gpu_type",
"l_isl",
"l_osl",
"l_concurrency",
@ -178,49 +179,85 @@ def get_job_info():
}
def query_history_data(gpu_type):
def get_common_values(new_data_dict, match_keys):
"""
Query post-merge data with specific gpu type and model name
Find keys from match_keys where all data entries in new_data_dict have identical values.
Returns a dict with those common key-value pairs.
A key is skipped if any entry is missing it or has a None value.
"""
# Query data from the last 14 days
last_days = 14
if not new_data_dict or not match_keys:
return {}
data_list = list(new_data_dict.values())
if not data_list:
return {}
common_values_dict = {}
for key in match_keys:
# Collect non-None, non-empty values for this key
values = []
for data in data_list:
if key in data and data[key] is not None:
values.append(data[key])
# Skip this key if any entry is missing it or has a None value
if len(values) != len(data_list):
continue
# Check if all valid values are identical
first_value = values[0]
if all(v == first_value for v in values):
common_values_dict[key] = first_value
return common_values_dict
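As an illustration of the helper above, a key is kept only when every entry carries the same non-None value for it; a small sketch with made-up field values (the field names mirror `SCENARIO_MATCH_FIELDS`):

```python
# Illustrative input only; the values are invented for this example.
new_data_dict = {
    0: {"s_runtime": "bench", "s_model_name": "deepseek_r1_fp4_v2", "l_isl": 1024},
    1: {"s_runtime": "bench", "s_model_name": "gpt_oss_120b_fp4", "l_isl": 1024},
}
match_keys = ["s_runtime", "s_model_name", "l_isl"]

# get_common_values(new_data_dict, match_keys) returns
# {"s_runtime": "bench", "l_isl": 1024}
# because s_model_name differs between the two entries and is therefore dropped.
```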
def query_history_data(common_values_dict):
"""
Query post-merge data with common values to narrow down scope.
"""
# Query data from the last 90 days
last_days = QUERY_LOOKBACK_DAYS
# Build must clauses with base filters
must_clauses = [
{
"term": {
"b_is_valid": True
}
},
{
"term": {
"b_is_post_merge": True
}
},
{
"term": {
"b_is_regression": False
}
},
{
"range": {
"ts_created": {
"gte":
int(time.time() - 24 * 3600 * last_days) // (24 * 3600) *
24 * 3600 * 1000,
}
}
},
]
# Add common values as term filters to narrow down the query
for key, value in common_values_dict.items():
must_clauses.append({"term": {key: value}})
json_data = {
"query": {
"bool": {
"must": [
{
"term": {
"b_is_valid": True
}
},
{
"term": {
"b_is_post_merge": True
}
},
{
"term": {
"b_is_regression": False
}
},
{
"term": {
"s_gpu_type": gpu_type
}
},
{
"range": {
"ts_created": {
"gte":
int(time.time() - 24 * 3600 * last_days) //
(24 * 3600) * 24 * 3600 * 1000,
}
}
},
]
"must": must_clauses
},
},
"size": 3000,
"size": MAX_QUERY_SIZE,
}
json_data = json.dumps(json_data)
@ -233,13 +270,13 @@ def query_history_data(gpu_type):
print_info(
f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned no response"
)
return []
return None
else:
payload = res.json().get("hits", {}).get("hits", [])
if len(payload) == 0:
# No history data found in database, return empty list
print_info(
f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned no data"
f"No history data found in {TEST_INFO_PROJECT_NAME}, returned empty list"
)
return []
for hit in payload:
@ -250,7 +287,7 @@ def query_history_data(gpu_type):
f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned data with no _id"
)
# Invalid data, return None
return []
return None
data_list.append(data_dict)
print_info(
f"Successfully query from {TEST_INFO_PROJECT_NAME}, queried {len(data_list)} entries"
@ -259,7 +296,7 @@ def query_history_data(gpu_type):
except Exception as e:
print_info(
f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned error: {e}")
return []
return None
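The function therefore has a three-valued contract, which `get_history_data` below relies on. A short sketch of how a caller distinguishes the cases (the common-values argument is just an example):

```python
# Sketch only; query_history_data is the function defined above.
history = query_history_data({"s_runtime": "bench"})
if history is None:
    print("query failed; regression gating will be skipped")
elif not history:
    print("query succeeded but no post-merge history exists yet")
else:
    print(f"queried {len(history)} history entries")
```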
def match(history_data, new_data, match_keys):
@ -329,7 +366,7 @@ def calculate_best_perf_result(history_data_list, new_data):
return best_metrics
def get_history_data(new_data_dict, gpu_type, match_keys):
def get_history_data(new_data_dict, match_keys, common_values_dict):
"""
Query history post-merge data for each cmd_idx
"""
@ -371,15 +408,23 @@ def get_history_data(new_data_dict, gpu_type, match_keys):
key=lambda x: parse_timestamp(x.get("@timestamp", 0)))
return latest_data
cmd_idxs = new_data_dict.keys()
history_data_list = None
if cmd_idxs:
history_data_list = query_history_data(common_values_dict)
# If query_history_data returned None, it means network failure
if history_data_list is None:
return None, None
# Query was successful (even if empty list), initialize dicts
history_baseline_dict = {}
history_data_dict = {}
cmd_idxs = new_data_dict.keys()
for cmd_idx in cmd_idxs:
history_data_dict[cmd_idx] = []
history_baseline_dict[cmd_idx] = []
history_data_list = []
if cmd_idxs:
history_data_list = query_history_data(gpu_type)
# Process history data if we have any
if history_data_list:
for history_data in history_data_list:
for cmd_idx in cmd_idxs:
@ -390,7 +435,9 @@ def get_history_data(new_data_dict, gpu_type, match_keys):
else:
history_data_dict[cmd_idx].append(history_data)
break
# Sometimes the database has several baselines; we only use the latest one
# If list is empty, set to None for each cmd_idx
for cmd_idx, baseline_list in history_baseline_dict.items():
latest_baseline = get_latest_data(baseline_list)
history_baseline_dict[cmd_idx] = latest_baseline
@ -430,24 +477,27 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
2. For Minimize metrics, if new perf is above baseline * (1 + threshold)
Set it as regressive.
"""
# If history_baseline_dict is None (network failure), skip regression check
if history_baseline_dict is None:
return []
regressive_data_list = []
cmd_idxs = new_data_dict.keys()
# Find regressive test cases
for cmd_idx in cmd_idxs:
for cmd_idx in new_data_dict:
if history_baseline_dict[cmd_idx] is None:
continue
baseline_data = history_baseline_dict[cmd_idx]
history_baseline = history_baseline_dict[cmd_idx]
new_data = new_data_dict[cmd_idx]
is_regressive = False
regressive_metrics = []
# Check MAXIMIZE_METRICS (new should be >= baseline * (1 - threshold))
for metric in MAXIMIZE_METRICS:
if metric not in new_data or metric not in baseline_data:
if metric not in new_data or metric not in history_baseline:
continue
threshold = get_threshold(baseline_data, metric)
baseline_value = baseline_data[metric]
threshold = get_threshold(history_baseline, metric)
baseline_value = history_baseline[metric]
new_value = new_data[metric]
# Regressive if new_value < baseline_value * (1 - threshold)
if new_value < baseline_value * (1 - threshold):
@ -456,10 +506,10 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
# Check MINIMIZE_METRICS (new should be <= baseline * (1 + threshold))
for metric in MINIMIZE_METRICS:
if metric not in new_data or metric not in baseline_data:
if metric not in new_data or metric not in history_baseline:
continue
threshold = get_threshold(baseline_data, metric)
baseline_value = baseline_data[metric]
threshold = get_threshold(history_baseline, metric)
baseline_value = history_baseline[metric]
new_value = new_data[metric]
# Regressive if new_value > baseline_value * (1 + threshold)
if new_value > baseline_value * (1 + threshold):
@ -471,9 +521,9 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
regressive_data = new_data.copy()
# Add baseline values and thresholds for all metrics
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
if metric in baseline_data:
if metric in history_baseline:
baseline_key = f"d_baseline_{metric[2:]}"
regressive_data[baseline_key] = baseline_data[metric]
regressive_data[baseline_key] = history_baseline[metric]
# Copy all threshold keys from baseline
metric_suffix = metric[2:]
@ -482,8 +532,8 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
f"d_threshold_post_merge_{metric_suffix}",
f"d_threshold_pre_merge_{metric_suffix}"
]:
if threshold_key in baseline_data:
regressive_data[threshold_key] = baseline_data[
if threshold_key in history_baseline:
regressive_data[threshold_key] = history_baseline[
threshold_key]
# Add regression info string
@ -495,11 +545,24 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
return regressive_data_list
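A worked example of the regression rule for a maximize metric, using the default post-merge threshold defined in this file (the metric name and numbers are illustrative):

```python
# Illustrative only; d_output_token_throughput is assumed to be one of the
# MAXIMIZE_METRICS ("larger is better").
baseline_value = 1000.0   # value stored in the history baseline
threshold = 0.05          # POST_MERGE_THRESHOLD
new_value = 930.0         # value measured by this run

# Maximize metric: regressive if the new value drops below baseline * (1 - threshold).
is_regressive = new_value < baseline_value * (1 - threshold)
print(is_regressive)  # True, since 930.0 < 950.0
```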
def prepare_baseline_data(history_data_dict, new_data_dict):
def _is_valid_baseline(baseline_data):
"""Check if baseline data is valid (non-empty dict)."""
if isinstance(baseline_data, dict) and len(baseline_data) > 0:
return True
return False
def prepare_baseline_data(history_baseline_dict, history_data_dict,
new_data_dict):
"""
Calculate new baseline from history post-merge data and new data.
Then return new baseline data.
"""
# If history_baseline_dict and history_data_dict are both None (network failure),
# return an empty dict so no baseline data is prepared
if history_baseline_dict is None and history_data_dict is None:
return {}
new_baseline_data_dict = {}
cmd_idxs = new_data_dict.keys()
# Find the best history post-merge data for each cmd
@ -507,18 +570,42 @@ def prepare_baseline_data(history_data_dict, new_data_dict):
# Calculate best metrics from history post-merge data and new data
best_metrics = calculate_best_perf_result(history_data_dict[cmd_idx],
new_data_dict[cmd_idx])
# Create new_baseline_data from new_data_dict and set b_is_baseline
new_baseline_data = new_data_dict[cmd_idx].copy()
new_baseline_data["b_is_baseline"] = True
# Add or update baseline metrics and thresholds
for metric, value in best_metrics.items():
new_baseline_data[metric] = value
# Initialize metric_threshold_dict with default thresholds for all metrics
metric_threshold_dict = {}
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
metric_suffix = metric[2:]
post_merge_key = f"d_threshold_post_merge_{metric_suffix}"
pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}"
new_baseline_data[post_merge_key] = new_baseline_data.get(
post_merge_key, POST_MERGE_THRESHOLD)
new_baseline_data[pre_merge_key] = new_baseline_data.get(
pre_merge_key, PRE_MERGE_THRESHOLD)
metric_threshold_dict[post_merge_key] = POST_MERGE_THRESHOLD
metric_threshold_dict[pre_merge_key] = PRE_MERGE_THRESHOLD
# If history baseline is valid, extract thresholds and update metric_threshold_dict
history_baseline = history_baseline_dict[cmd_idx]
if _is_valid_baseline(history_baseline):
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
metric_suffix = metric[2:]
post_merge_key = f"d_threshold_post_merge_{metric_suffix}"
pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}"
if post_merge_key in history_baseline:
metric_threshold_dict[post_merge_key] = history_baseline[
post_merge_key]
if pre_merge_key in history_baseline:
metric_threshold_dict[pre_merge_key] = history_baseline[
pre_merge_key]
# Update new_baseline_data with best_metrics values
for metric, value in best_metrics.items():
new_baseline_data[metric] = value
# Add all thresholds to new_baseline_data
for threshold_key, threshold_value in metric_threshold_dict.items():
new_baseline_data[threshold_key] = threshold_value
add_id(new_baseline_data)
new_baseline_data_dict[cmd_idx] = new_baseline_data
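The threshold handling above can be summarized as: start from the defaults and let any `d_threshold_*` keys carried by the history baseline override them. A small sketch for a single metric suffix (the suffix is an assumption for illustration):

```python
PRE_MERGE_THRESHOLD = 0.1
POST_MERGE_THRESHOLD = 0.05

# Hypothetical history baseline carrying only a post-merge override.
history_baseline = {"d_threshold_post_merge_output_token_throughput": 0.08}

post_merge_key = "d_threshold_post_merge_output_token_throughput"
pre_merge_key = "d_threshold_pre_merge_output_token_throughput"

# Defaults first, then overridden by whatever the history baseline carries.
thresholds = {post_merge_key: POST_MERGE_THRESHOLD, pre_merge_key: PRE_MERGE_THRESHOLD}
for key in (post_merge_key, pre_merge_key):
    if key in history_baseline:
        thresholds[key] = history_baseline[key]

print(thresholds[post_merge_key], thresholds[pre_merge_key])  # 0.08 0.1
```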
@ -539,7 +626,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict,
if cmd_idx in new_data_dict:
data_list.append(new_data_dict[cmd_idx])
# Only post regressive test cases when post-merge.
if new_baseline_data_dict:
# new_baseline_data_dict is None means pre-merge.
if new_baseline_data_dict and regressive_data_list:
data_list.extend(regressive_data_list)
if not data_list:
return

View File

@ -148,8 +148,8 @@ def main():
job_workspace = sys.argv[1]
if not os.path.isdir(job_workspace):
print(f"Error: {job_workspace} is not a valid directory")
sys.exit(1)
print(f"Skipping perf regression check since {job_workspace} is not a valid directory.")
return 0
perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml")
all_perf_data = read_yaml_data(perf_data_files)
@ -171,15 +171,33 @@ def main():
print("=" * 60)
print_regression_data(data)
# Split regression data into post-merge and pre-merge categories
post_merge_regressions = [
data for data in all_regression_data if data.get("b_is_post_merge", False)
]
pre_merge_regressions = [
data for data in all_regression_data if not data.get("b_is_post_merge", False)
]
if len(all_regression_data) == 0:
print("\n No regression data found. Perf check is successful.")
return 0
else:
if len(pre_merge_regressions) > 0:
print(
f"\n Warning: Found {len(all_regression_data)} regression data. Perf check is failed."
f"\n Warning: Found {len(pre_merge_regressions)} pre-merge regression data. "
"But we don't fail the check temporarily."
)
if len(post_merge_regressions) > 0:
print(
f"\n Error: Found {len(post_merge_regressions)} post-merge regression data. Perf check is failed."
)
return 1
print("\n No post-merge regression data found. Perf check is successful.")
return 0
if __name__ == "__main__":
sys.exit(main())

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -19,22 +19,17 @@ import io
import os
import re
import subprocess
import time
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Dict, List, NamedTuple, Optional
import requests
import yaml
from _pytest.nodes import Item
from _pytest.python import Function
from defs.trt_test_alternative import (check_output, popen, print_error,
print_info)
from test_common.http_utils import wait_for_endpoint_ready
from tensorrt_llm._utils import get_free_port
from ..common import get_trt_llm_lib_dir, venv_mpi_check_output
from ..local_venv import PythonVenvRunnerImpl
from ..test_list_parser import parse_test_list
@ -243,55 +238,6 @@ class PerfBenchScriptTestCmds(NamedTuple):
return cmd_str
class PerfAggrScriptTestCmds(NamedTuple):
server_cmds: List[List[str]]
client_cmds: List[List[str]]
names: List[str]
timeout: int
output_dir: str
def run_cmd(self, cmd_idx: int, venv) -> str:
output = ""
server_proc = None
server_file_path = os.path.join(
self.output_dir, f"trtllm-serve.{self.names[cmd_idx]}.log")
client_file_path = os.path.join(
self.output_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log")
try:
server_hostname = "localhost"
server_port = get_free_port()
server_cmd = add_host_port_to_cmd(self.server_cmds[cmd_idx],
server_hostname, server_port)
print_info(f"Starting server. cmd is {server_cmd}")
with open(server_file_path, 'w') as server_ctx:
server_proc = subprocess.Popen(
server_cmd,
stdout=server_ctx,
stderr=subprocess.STDOUT,
env=copy.deepcopy(os.environ),
)
wait_for_endpoint_ready(
f"http://{server_hostname}:{server_port}/health",
timeout=self.timeout)
client_cmd = add_host_port_to_cmd(self.client_cmds[cmd_idx],
server_hostname, server_port)
print_info(f"Starting client. cmd is {client_cmd}")
output = subprocess.check_output(
client_cmd,
stderr=subprocess.STDOUT,
env=copy.deepcopy(os.environ),
).decode()
with open(client_file_path, 'w') as client_ctx:
client_ctx.write(output)
finally:
server_proc.terminate()
server_proc.wait()
return output
def get_cmd_str(self, cmd_idx) -> List[str]:
return ["aggr_server tests, please check config files"]
class PerfDisaggScriptTestCmds(NamedTuple):
ctx_cmd: str
gen_cmd: str
@ -341,249 +287,6 @@ class PerfDisaggScriptTestCmds(NamedTuple):
return ["disaggregated server tests, please check config files"]
class PerfMultiNodeDisaggScriptTestCmds(NamedTuple):
ctx_server_cmds: List[List[str]]
gen_server_cmds: List[List[str]]
disagg_server_cmds: List[List[str]]
benchmark_cmds: List[List[str]]
timeout: int
hostname: str
disagg_serving_type: str
num_ctx_servers: int
num_gen_servers: int
output_dir: str
def _generate_hostname_file(self, cmd_idx: int, port: int):
# Create hostnames directory
hostnames_dir = os.path.join(self.output_dir, f"hostnames-{cmd_idx}")
if not os.path.exists(hostnames_dir):
os.makedirs(hostnames_dir, exist_ok=True)
hostname_file = os.path.join(hostnames_dir,
f"{self.disagg_serving_type}.txt")
with open(hostname_file, 'w') as f:
f.write(f"{self.hostname}:{port}")
def _generate_disagg_server_config(self, cmd_idx: int,
disagg_server_port: int) -> str:
print_info(
f"Generating disagg server config for command index {cmd_idx}")
hostnames_folder = os.path.join(self.output_dir, f"hostnames-{cmd_idx}")
expected_count = self.num_ctx_servers + self.num_gen_servers
start_time = time.time()
hostnames = []
while True:
elapsed_time = time.time() - start_time
print_info(
f"Waiting for hostnames in {hostnames_folder}, elapsed time: {elapsed_time}s, current: {len(hostnames)}, expected: {expected_count}"
)
if elapsed_time > self.timeout:
print_error(
f"Time out. Hostnames files are not ready after {self.timeout}s"
)
time.sleep(10)
if not os.path.exists(hostnames_folder):
continue
hostnames = os.listdir(hostnames_folder)
if len(hostnames) >= expected_count:
break
print_info(
f"All hostnames found in {hostnames_folder} after elapsed time: {elapsed_time}s"
)
# Read ctx and gen hostnames
ctx_hostnames = []
gen_hostnames = []
for hostname_file in hostnames:
hostname_file_path = os.path.join(hostnames_folder, hostname_file)
with open(hostname_file_path, 'r') as f:
hostname_port = f.read().strip()
hostname = hostname_port.split(":")[0]
port = hostname_port.split(":")[1]
print_info(
f"Hostname File: {hostname_file_path} Hostname: {hostname_port} Port: {port}"
)
if hostname_file.startswith("CTX"):
ctx_hostnames.append(hostname_port)
elif hostname_file.startswith("GEN"):
gen_hostnames.append(hostname_port)
server_config = {
'hostname': self.hostname,
'port': disagg_server_port,
'backend': 'pytorch',
'context_servers': {
'num_instances': self.num_ctx_servers,
'urls': ctx_hostnames,
},
'generation_servers': {
'num_instances': self.num_gen_servers,
'urls': gen_hostnames,
}
}
config_path = os.path.join(self.output_dir,
f"server_config.{cmd_idx}.yaml")
with open(config_path, 'w') as f:
yaml.dump(server_config, f)
print_info(f"Server config file {config_path} generated")
return config_path
def _get_disagg_server_hostname_and_port(self, cmd_idx: int) -> tuple:
config_path = os.path.join(self.output_dir,
f"server_config.{cmd_idx}.yaml")
start_time = time.time()
while True:
if os.path.exists(config_path):
print_info(f"Server config file found: {config_path}")
break
elapsed_time = time.time() - start_time
if elapsed_time > self.timeout:
print_error(
f"Server config file {config_path} not found after {self.timeout}s"
)
print_info(
f"Waiting for server config file, elapsed time: {elapsed_time}s"
)
time.sleep(10) # Check every 10 seconds
# Read server config to get hostname and port
with open(config_path, 'r') as f:
server_config = yaml.safe_load(f)
disagg_server_hostname = server_config['hostname']
disagg_server_port = server_config['port']
return disagg_server_hostname, disagg_server_port
def wait_for_benchmark_ready(self,
benchmark_status_file: str,
timeout: int = 7200):
start_time = time.time()
while True:
if os.path.exists(benchmark_status_file):
print_info(
f"Benchmark status file found, terminating server {self.disagg_serving_type}"
)
break
elapsed_time = time.time() - start_time
print_info(
f"Waiting for benchmark status file, elapsed time: {elapsed_time}s"
)
if elapsed_time > timeout:
print_error(
f"Timeout waiting for benchmark status file after {timeout}s, terminating server {self.disagg_serving_type}"
)
break
time.sleep(10) # Check every 10 seconds
def wait_for_endpoint_ready(self, url: str, timeout: int = 7200):
start = time.monotonic()
while True:
elapsed_time = time.monotonic() - start
if elapsed_time > timeout:
print_error(
f"Timeout waiting for endpoint {url} to be ready after {timeout} seconds"
)
break
print_info(
f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s"
)
try:
time.sleep(10)
if requests.get(url).status_code == 200:
print_info(f"endpoint {url} is ready")
return
except Exception as err:
print_info(
f"endpoint {url} is not ready, with exception: {err}")
print_error(
f"Endpoint {url} did not become ready within {timeout} seconds")
def run_cmd(self, cmd_idx: int, venv) -> str:
output = ""
server_proc = None
benchmark_status_file = os.path.join(self.output_dir,
f"benchmark_status.{cmd_idx}.txt")
port = get_free_port()
if "CTX" in self.disagg_serving_type or "GEN" in self.disagg_serving_type:
self._generate_hostname_file(cmd_idx, port)
server_file_path = os.path.join(
self.output_dir,
f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log")
is_ctx = "CTX" in self.disagg_serving_type
server_cmd = self.ctx_server_cmds[
cmd_idx] if is_ctx else self.gen_server_cmds[cmd_idx]
server_cmd = add_host_port_to_cmd(server_cmd, self.hostname, port)
try:
print_info(
f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd}"
)
with open(server_file_path, 'w') as server_ctx:
server_proc = subprocess.Popen(
server_cmd,
stdout=server_ctx,
stderr=subprocess.STDOUT,
env=copy.deepcopy(os.environ),
)
self.wait_for_benchmark_ready(benchmark_status_file,
timeout=self.timeout)
finally:
print_info(f"Server {self.disagg_serving_type} stopped")
server_proc.terminate()
server_proc.wait()
elif self.disagg_serving_type == "DISAGG_SERVER":
disagg_server_file_path = os.path.join(
self.output_dir,
f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log")
disagg_server_cmd = self.disagg_server_cmds[cmd_idx]
try:
self._generate_disagg_server_config(cmd_idx, port)
print_info(
f"Starting disagg server. disagg_serving_type: {self.disagg_serving_type} disagg server cmd is {disagg_server_cmd}"
)
with open(disagg_server_file_path, 'w') as disagg_server_ctx:
disagg_server_proc = subprocess.Popen(
disagg_server_cmd,
stdout=disagg_server_ctx,
stderr=subprocess.STDOUT,
env=copy.deepcopy(os.environ),
)
self.wait_for_benchmark_ready(benchmark_status_file,
timeout=self.timeout)
finally:
print_info(f"Disagg server {self.disagg_serving_type} stopped")
disagg_server_proc.terminate()
disagg_server_proc.wait()
elif self.disagg_serving_type == "BENCHMARK":
benchmark_file_path = os.path.join(
self.output_dir, f"trtllm-benchmark.{cmd_idx}.log")
try:
disagg_server_hostname, disagg_server_port = self._get_disagg_server_hostname_and_port(
cmd_idx)
benchmark_cmd = add_host_port_to_cmd(
self.benchmark_cmds[cmd_idx], disagg_server_hostname,
disagg_server_port)
self.wait_for_endpoint_ready(
f"http://{disagg_server_hostname}:{disagg_server_port}/health",
timeout=self.timeout,
)
print_info(
f"Starting benchmark. disagg_serving_type: {self.disagg_serving_type} benchmark cmd is {benchmark_cmd}"
)
output = subprocess.check_output(
benchmark_cmd,
env=copy.deepcopy(os.environ),
stderr=subprocess.STDOUT).decode()
with open(benchmark_file_path, 'w') as benchmark_ctx:
benchmark_ctx.write(output)
finally:
with open(benchmark_status_file, 'w') as status_file:
status_file.write("Done")
return output
def get_cmd_str(self, cmd_idx) -> List[str]:
return [
"multi-node disaggregated server tests, please check config files"
]
class AbstractPerfScriptTestClass(abc.ABC):
"""
Abstract class for all script-based perf tests.
@ -715,14 +418,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
cmd_str = commands.get_cmd_str(cmd_idx)
is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str
is_perf_sanity_test = "perf_sanity" in full_test_name
is_disagg_server = False
if self._config.runtime == "multi_node_disagg_server":
disagg_serving_type = self._config.disagg_configs[0][
'disagg_serving_type']
is_disagg_server = disagg_serving_type != "BENCHMARK"
# Start the timer.
self._start_timestamp = datetime.utcnow()
try:
@ -730,8 +425,7 @@ class AbstractPerfScriptTestClass(abc.ABC):
# Capture the stdout from _gpu_clock_lock because the pipeline JUnit update script tries to parse
# the log to find the GPU clocks.
with io.StringIO() as buf:
# Perf-sanity test doesn't lock gpu clock
if self._gpu_clock_lock and not is_perf_sanity_test:
if self._gpu_clock_lock:
# Lock GPU clock and start monitoring.
with contextlib.redirect_stdout(
buf), self._gpu_clock_lock, tmpDir:
@ -746,7 +440,7 @@ class AbstractPerfScriptTestClass(abc.ABC):
print(collect_and_clean_myelin_time(output))
# Check whether output has error message
if not is_prepare_dataset_cmd and is_perf_sanity_test:
if not is_prepare_dataset_cmd:
self._check_benchmark_output_for_errors(output)
# Print the output log to stdout and cache it.
@ -793,10 +487,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
f"skip writing perf result when calling generating dataset in trtllm-bench."
)
outputs.pop(cmd_idx)
elif is_disagg_server:
print_info(
f"skip writing perf result when running disagg's worker or server."
)
else:
self._perf_result = self.get_perf_result(outputs)
@ -818,11 +508,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
Store the test results in the _test_results.
Write the test results and GPU monitoring data to the output csv and/or yaml files.
"""
# Store the test result
if cmd_idx not in self._test_results:
self._test_results[cmd_idx] = {}
self._test_results[cmd_idx][metric_type] = self._perf_result
# Get GPU monitoring data
self._gpu_monitor_data = self._gpu_clock_lock.get_state_data()
if not self._gpu_monitor_data:

View File

@ -15,9 +15,9 @@ l0_dgx_b200_perf_sanity:
backend: pytorch
orchestrator: mpi
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- condition:
ranges:
@ -34,8 +34,6 @@ l0_dgx_b200_perf_sanity:
backend: pytorch
orchestrator: mpi
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)

View File

@ -16,9 +16,9 @@ l0_dgx_b300_perf_sanity:
backend: pytorch
orchestrator: mpi
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- condition:
ranges:
@ -36,6 +36,6 @@ l0_dgx_b300_perf_sanity:
backend: pytorch
orchestrator: mpi
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)

View File

@ -1,5 +1,33 @@
version: 0.0.1
l0_gb200_multi_gpus_perf_sanity:
- condition:
ranges:
system_gpu_count:
gte: 4
lte: 4
wildcards:
gpu:
- '*gb200*'
linux_distribution_name: ubuntu*
cpu: aarch64
terms:
stage: pre_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)
- condition:
ranges:
system_gpu_count:
@ -14,6 +42,17 @@ l0_gb200_multi_gpus_perf_sanity:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k]
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes:
- condition:
ranges:
# 2 nodes with each node has 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] TIMEOUT (90)

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001:
- condition:
ranges:
# 2 nodes with each node has 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1]

View File

@ -1,5 +1,5 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001:
l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes:
- condition:
ranges:
# 3 nodes with each node has 4 GPUs
@ -13,4 +13,4 @@ l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (90)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes:
- condition:
ranges:
# 6 nodes with each node has 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (90)

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001:
- condition:
ranges:
# 6 nodes with each node has 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002:
- condition:
ranges:
# 6 nodes with each node has 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (180)

View File

@ -1,5 +1,5 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001:
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes:
- condition:
ranges:
# 8 nodes with each node has 4 GPUs
@ -13,4 +13,4 @@ l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)

View File

@ -11,24 +11,69 @@ Performance sanity testing scripts for TensorRT-LLM with configuration-driven te
## Configuration File Types
There are three types of YAML config files for different deployment architectures.
Aggregated config files are in [`tests/scripts/perf-sanity`](./).
Disaggregated config files are in [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf).
There are two modes for perf sanity tests: aggregated (aggr) and disaggregated (disagg).
### 1. Single-Node Aggregated Test Configuration
### Aggregated Mode (aggr)
**File Example**: `deepseek_r1_fp4_v2_grace_blackwell.yaml`
**Config Location**: [`tests/scripts/perf-sanity`](./)
**Use Case**: Single-node performance tests on a single server with multiple GPUs.
**File Naming**: `xxx.yaml` where words are connected by `_` (underscore), not `-` (hyphen).
### 2. Multi-Node Aggregated Test Configuration
**File Examples**:
- `deepseek_r1_fp4_v2_grace_blackwell.yaml` - Single-node aggregated test
- `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml` - Multi-node aggregated test
**File Example**: `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml`
**Use Cases**:
- Single-node: Performance tests on a single server with multiple GPUs
- Multi-node: Model runs across multiple nodes with unified execution
**Use Case**: Multi-node aggregated architecture where model runs across multiple nodes with unified execution.
**Test Case Names**:
```
perf/test_perf_sanity.py::test_e2e[aggr_upload-{config yaml file base name}]
perf/test_perf_sanity.py::test_e2e[aggr_upload-{config yaml file base name}-{server_config_name}]
```
### 3. Multi-Node Disaggregated Test Configuration
- Without server config name: runs all server configs in the YAML file
- With server config name: runs only the specified server config (the `name` field in `server_configs`)
**Examples**:
```
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell]
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k]
```
### Disaggregated Mode (disagg)
**Config Location**: [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf)
**File Naming**: `xxx.yaml` (can contain `-` hyphen).
**File Example**: `deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml`
**Use Case**: Disaggregated architecture where model runs across multiple nodes with separate context (prefill) and generation (decode) servers.
**Test Case Name**:
```
perf/test_perf_sanity.py::test_e2e[disagg_upload-{config yaml file base name}]
```
**Example**:
```
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX]
```
## Running Tests
**Important**: Do NOT add the `--perf` flag when running pytest. Perf sanity tests are static test cases and do not use perf mode.
```bash
# Run all server configs in an aggregated test
pytest perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell]
# Run a specific server config in an aggregated test
pytest perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
# Run a specific disaggregated test
pytest perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX]
```
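In post-merge perf-sanity stages, the pipeline then runs `tests/integration/defs/perf/perf_regression_check.py` against the job workspace (the directory that collects the per-test `perf_data.yaml` files) and fails the build when the script exits non-zero. A minimal sketch of the same invocation run locally, assuming a workspace path:

```python
import subprocess
import sys

# Hypothetical local workspace containing perf_data.yaml files produced by the tests.
job_workspace = "/tmp/perf_sanity_workspace"

result = subprocess.run(
    [sys.executable, "tests/integration/defs/perf/perf_regression_check.py", job_workspace],
    check=False,
)
# Exit code 0: no post-merge regression (pre-merge regressions only print a warning).
# Exit code 1: post-merge regression detected; CI fails the build.
print(f"perf regression check exit code: {result.returncode}")
```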

View File

@ -35,8 +35,9 @@ server_configs:
iterations: 12
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep8_mtp3"
model_name: "deepseek_r1_0528_fp4_v2"
trust_remote_code: true
@ -64,5 +65,5 @@ server_configs:
iterations: 12
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"

View File

@ -30,12 +30,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k1k"
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 10
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
@ -65,7 +65,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
@ -95,5 +95,5 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"

View File

@ -31,12 +31,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k1k"
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 10
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
@ -66,7 +66,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
@ -96,7 +96,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
# 8k1k configs
@ -126,12 +126,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_8k1k"
- name: "con2048_iter5_8k1k"
concurrency: 2048
iterations: 10
iterations: 5
isl: 8192
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_8k1k"
@ -161,7 +161,7 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -191,7 +191,7 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
# 1k8k configs
@ -221,12 +221,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k8k"
- name: "con2048_iter5_1k8k"
concurrency: 2048
iterations: 10
iterations: 5
isl: 1024
osl: 8192
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k8k"
@ -256,7 +256,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k8k"
@ -286,5 +286,5 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"

View File

@ -30,12 +30,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con4096_iter10_1k1k"
- name: "con4096_iter5_1k1k"
concurrency: 4096
iterations: 10
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp8_tep8_mtp3_1k1k"
@ -65,7 +65,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp8_tp8_mtp3_1k1k"
@ -95,5 +95,5 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"

View File

@ -4,6 +4,37 @@ metadata:
- B200
- B300
server_configs:
- name: "gpt_oss_fp4_dep4_1k8k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 640
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
enable_balance: true
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 640
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con2560_iter5_1k8k"
concurrency: 2560
iterations: 5
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
- name: "gpt_oss_fp4_dep2_1k1k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 2
@ -32,25 +63,23 @@ server_configs:
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "gpt_oss_fp4_dep4_1k1k"
- name: "gpt_oss_fp4_tep2_1k8k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 4
moe_expert_parallel_size: 4
tensor_parallel_size: 2
moe_expert_parallel_size: 2
pipeline_parallel_size: 1
max_batch_size: 512
max_batch_size: 128
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
enable_balance: true
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
max_batch_size: 128
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
@ -58,12 +87,41 @@ server_configs:
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 5
- name: "con128_iter10_1k8k"
concurrency: 128
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
osl: 8192
random_range_ratio: 0.8
backend: "openai"
- name: "gpt_oss_fp4_tp2_1k8k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 2
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con8_iter10_1k8k"
concurrency: 8
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
- name: "gpt_oss_fp4_tp4_eagle3_1k1k"
@ -97,5 +155,5 @@ server_configs:
iterations: 32
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"