Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[TRTLLM-9834][feat] Transfer to TRTLLM-INFRA Database and Fail post-merge tests if regression (#10282)
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
Parent: 464847c6be
Commit: a23c6f1092
@@ -893,7 +893,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Create a unique suffix for the job name
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
-    def disaggMode = stageName.contains("Perf-Sanity-Disagg")
+    def perfSanityMode = stageName.contains("PerfSanity")
+    def disaggMode = stageName.contains("PerfSanity-Disagg")
     def setSegment = disaggMode

     Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@@ -938,6 +939,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
     def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
     def coverageConfigFile = "${jobWorkspace}/.coveragerc"
+    def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
+    def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py"

     stage("[${stageName}] Initializing Test") {
         // Create Job Workspace folder in Frontend Node
@@ -1020,6 +1023,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             coverageConfigFile
         )

+        if (perfSanityMode) {
+            Utils.copyFileToRemoteHost(
+                pipeline,
+                remote,
+                perfCheckScriptLocal,
+                perfCheckScriptNode,
+                true
+            )
+        }
+
         // Generate Pytest command
         String pytestUtil = ""
         if (nodeCount > 1) {
@@ -1094,7 +1107,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     // Define environment variables to export
     def envVarNames = [
         'OPEN_SEARCH_DB_BASE_URL',
+        'OPEN_SEARCH_DB_CREDENTIALS',
         'OPEN_SEARCH_DB_CREDENTIALS_USR',
         'OPEN_SEARCH_DB_CREDENTIALS_PSW',
         'BUILD_ID',
         'BUILD_URL',
         'JOB_NAME',
@@ -1300,6 +1314,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
             ),
             numRetries: 3
         )
+
+        if (perfSanityMode) {
+            stage("[${stageName}] Check perf result") {
+                def perfCheckResult = Utils.exec(
+                    pipeline,
+                    script: Utils.sshUserCmd(
+                        remote,
+                        "python3 ${perfCheckScriptNode} ${jobWorkspace}"
+                    ),
+                    returnStatus: true
+                )
+                if (perfCheckResult != 0) {
+                    error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+                }
+            }
+        }
     }

     echo "Finished test stage execution."
@@ -2785,7 +2815,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG
         error "Some tests still failed after rerun attempts, please check the test report."
     }

-    if (perfMode && !stageName.contains("Perf-Sanity")) {
+    if (perfMode) {
         basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
         basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
         stage("Check perf result") {
@@ -2811,7 +2841,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG
         }
     }

-    if (perfMode && stageName.contains("Perf-Sanity")) {
+    if (stageName.contains("PerfSanity")) {
         stage ("Check perf result") {
             def perfCheckResult = sh(
                 script: """
@@ -2820,10 +2850,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG
                 """,
                 returnStatus: true
             )
-            // TODO: Enable this when perf regression check is stable
-            // if (perfCheckResult != 0) {
-            //     error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
-            // }
+            if (perfCheckResult != 0) {
+                error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
+            }
         }
     }
 }
@@ -3187,7 +3216,7 @@ def launchTestJobs(pipeline, testFilter)
         "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
     ]

-    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
+    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
         def config = VANILLA_CONFIG
         if (key.contains("single-device")) {
             config = SINGLE_DEVICE_CONFIG
@@ -3198,7 +3227,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("Pybind")) {
             config = PYBIND_CONFIG
         }
-        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
+        runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
     }]]}
     fullSet = parallelJobs.keySet()

@@ -3219,9 +3248,12 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
         // Perf sanity post merge test
-        // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
-        // "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
-        // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
+        // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8],
+        // "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8],
     ]
     fullSet += x86SlurmTestConfigs.keySet()

@@ -3233,7 +3265,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}

     parallelJobs += parallelSlurmJobs
@@ -3252,11 +3284,30 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
-        // Perf sanity post merge test
-        "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
+        // Perf sanity pre merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
+        // Perf sanity post merge test
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
+        "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()

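The config tuples above are positional. A hedged reading, inferred from how `runLLMTestlistOnSlurm` consumes `values[0]`..`values[5]` later in this diff (the field names below are ours, not the pipeline's):

```python
from typing import NamedTuple

class SlurmStageConfig(NamedTuple):
    """Hypothetical view of one test-config tuple from the maps above."""
    queue: str            # values[0], e.g. "gb200-x4-oci"
    test_list: str        # values[1], e.g. "l0_gb200_multi_gpus_perf_sanity"
    split_id: int         # values[2], 1-based shard index
    split_count: int      # values[3], total number of shards
    gpu_count: int = 1    # values[4], defaulted via `values[4] ?: 1`
    node_count: int = 1   # values[5], defaulted via `values[5] ?: 1` (or 2 for multi-node)

# Under this reading, "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3" runs
# shard 3 of 14 of the perf-sanity list on a 4-GPU, single-node allocation.
cfg = SlurmStageConfig("gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4)
```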
@@ -3268,13 +3319,15 @@ def launchTestJobs(pipeline, testFilter)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
-        // Perf sanity post merge aggr tests
-        "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
-        // Perf sanity post merge disagg tests
-        "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
-        // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
-        // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
+        // Perf sanity pre merge tests
+        // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // Perf sanity post merge tests
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
+        "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
+        // "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
+        // "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()

@@ -3292,7 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}
     parallelJobs += parallelSlurmJobs

@@ -3305,7 +3358,7 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
     }]]}

     parallelJobs += parallelMultiNodesSBSAJobs

@@ -51,6 +51,7 @@ TEST_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-test_info"
 JOB_MACHINE_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-job_machine_info"
 FAILED_STEP_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-failed_step_info"
 PR_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-pr_info"
+PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"

 READ_ACCESS_PROJECT_NAME = [
     JOB_PROJECT_NAME,
@@ -59,9 +60,12 @@ READ_ACCESS_PROJECT_NAME = [
     JOB_MACHINE_PROJECT_NAME,
     FAILED_STEP_PROJECT_NAME,
     PR_PROJECT_NAME,
+    PERF_SANITY_PROJECT_NAME,
 ]

-WRITE_ACCESS_PROJECT_NAME = []
+WRITE_ACCESS_PROJECT_NAME = [
+    PERF_SANITY_PROJECT_NAME,
+]

 DISABLE_OPEN_SEARCH_DB_FOR_LOCAL_TEST = False
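For context, the read/write lists above gate which OpenSearch projects a CI run may query or post to; this change grants the perf-sanity project write access so results and baselines can be uploaded. A minimal sketch of how such a guard might look (the value and helper below are hypothetical, not from this module):

```python
PERF_SANITY_PROJECT_NAME = "example-ci-prod-perf_sanity_info"  # hypothetical value
WRITE_ACCESS_PROJECT_NAME = [PERF_SANITY_PROJECT_NAME]

def can_write(project_name: str) -> bool:
    # Hypothetical helper: writes are permitted only for listed projects.
    return project_name in WRITE_ACCESS_PROJECT_NAME

assert can_write(PERF_SANITY_PROJECT_NAME)
```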
@@ -108,7 +108,7 @@ eval $pytestCommand
 pytest_exit_code=$?
 echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

-if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
+if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
     if [[ "$stageName" == *PyTorch* ]]; then
         basePerfFilename="base_perf_pytorch.csv"
     else
@@ -135,14 +135,6 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
     echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
 fi

-if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
-    echo "Check Perf-Sanity Result"
-    python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
-        $jobWorkspace
-    perf_sanity_check_exit_code=$?
-    echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
-fi
-
 if [ "$pytest_exit_code" -ne 0 ]; then
     final_exit_code=$pytest_exit_code
 elif [ "$perf_check_exit_code" -ne 0 ]; then
@@ -29,12 +29,14 @@ _project_root = os.path.abspath(
     os.path.join(os.path.dirname(__file__), '../../../..'))
 if _project_root not in sys.path:
     sys.path.insert(0, _project_root)
-from jenkins.scripts.open_search_db import OpenSearchDB
+from jenkins.scripts.open_search_db import (PERF_SANITY_PROJECT_NAME,
+                                            OpenSearchDB)

-PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1"  # "sandbox-trtllm-ci-perf"
-TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info"
 PRE_MERGE_THRESHOLD = 0.1
 POST_MERGE_THRESHOLD = 0.05
+POC_PROJECT_NAME = "sandbox-temp-trtllm-ci-perf-v1-test_info"
+USE_POC_DB = os.environ.get("USE_POC_DB", "false").lower() == "true"
+TEST_INFO_PROJECT_NAME = POC_PROJECT_NAME if USE_POC_DB else PERF_SANITY_PROJECT_NAME
+MAX_QUERY_SIZE = 5000
+QUERY_LOOKBACK_DAYS = 90

 # Metrics where larger is better
 MAXIMIZE_METRICS = [
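Note that the `USE_POC_DB` switch above is evaluated at module import time, so it only takes effect if the environment variable is set before the module is imported. A hypothetical usage sketch:

```python
import os

# Must happen before importing the perf-check module, since
# TEST_INFO_PROJECT_NAME is computed at import time.
os.environ["USE_POC_DB"] = "true"
# After import, TEST_INFO_PROJECT_NAME == POC_PROJECT_NAME (the sandbox index)
# instead of PERF_SANITY_PROJECT_NAME (the production index).
```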
@@ -67,7 +69,6 @@ MINIMIZE_METRICS = [
 SCENARIO_MATCH_FIELDS = [
     "s_runtime",
     "s_model_name",
-    "s_gpu_type",
     "l_isl",
     "l_osl",
     "l_concurrency",
@@ -178,49 +179,85 @@ def get_job_info():
     }


-def query_history_data(gpu_type):
+def get_common_values(new_data_dict, match_keys):
     """
-    Query post-merge data with specific gpu type and model name
+    Find keys from match_keys where all data entries in new_data_dict have identical values.
+    Returns a dict with those common key-value pairs.
+    Skips entries that don't have the key or have None/empty values.
     """
-    # Query data from the last 14 days
-    last_days = 14
+    if not new_data_dict or not match_keys:
+        return {}
+
+    data_list = list(new_data_dict.values())
+    if not data_list:
+        return {}
+
+    common_values_dict = {}
+    for key in match_keys:
+        # Collect non-None, non-empty values for this key
+        values = []
+        for data in data_list:
+            if key in data and data[key] is not None:
+                values.append(data[key])
+
+        # Skip if no valid values found
+        if len(values) != len(data_list):
+            continue
+
+        # Check if all valid values are identical
+        first_value = values[0]
+        if all(v == first_value for v in values):
+            common_values_dict[key] = first_value
+
+    return common_values_dict
+
+
+def query_history_data(common_values_dict):
+    """
+    Query post-merge data with common values to narrow down scope.
+    """
+    # Query data from the last 90 days
+    last_days = QUERY_LOOKBACK_DAYS
+
+    # Build must clauses with base filters
+    must_clauses = [
+        {"term": {"b_is_valid": True}},
+        {"term": {"b_is_post_merge": True}},
+        {"term": {"b_is_regression": False}},
+        {
+            "range": {
+                "ts_created": {
+                    "gte":
+                    int(time.time() - 24 * 3600 * last_days) // (24 * 3600) *
+                    24 * 3600 * 1000,
+                }
+            }
+        },
+    ]
+
+    # Add common values as term filters to narrow down the query
+    for key, value in common_values_dict.items():
+        must_clauses.append({"term": {key: value}})
+
     json_data = {
         "query": {
             "bool": {
-                "must": [
-                    {"term": {"b_is_valid": True}},
-                    {"term": {"b_is_post_merge": True}},
-                    {"term": {"b_is_regression": False}},
-                    {"term": {"s_gpu_type": gpu_type}},
-                    {
-                        "range": {
-                            "ts_created": {
-                                "gte":
-                                int(time.time() - 24 * 3600 * last_days) //
-                                (24 * 3600) * 24 * 3600 * 1000,
-                            }
-                        }
-                    },
-                ]
+                "must": must_clauses
             },
         },
-        "size": 3000,
+        "size": MAX_QUERY_SIZE,
     }
     json_data = json.dumps(json_data)
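To illustrate `get_common_values` with hypothetical data: only keys whose values are present and identical across every entry survive into the common dict, and those survivors become extra `term` filters on the query. Also note the `gte` bound in the range clause floors `now - QUERY_LOOKBACK_DAYS` down to a whole day and converts seconds to milliseconds.

```python
# Hypothetical perf entries keyed by cmd_idx:
new_data_dict = {
    0: {"s_gpu_type": "GB200", "s_model_name": "deepseek_r1", "l_isl": 1024},
    1: {"s_gpu_type": "GB200", "s_model_name": "gpt_oss_120b", "l_isl": 1024},
}
match_keys = ["s_gpu_type", "s_model_name", "l_isl"]

# get_common_values(new_data_dict, match_keys) would return:
#   {"s_gpu_type": "GB200", "l_isl": 1024}
# s_model_name differs between entries, so it is dropped and the DB query
# is narrowed only by the shared gpu type and input sequence length.
```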
@@ -233,13 +270,13 @@ def query_history_data(gpu_type):
             print_info(
                 f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned no response"
             )
-            return []
+            return None
         else:
             payload = res.json().get("hits", {}).get("hits", [])
             if len(payload) == 0:
+                # No history data found in database, return empty list
                 print_info(
-                    f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned no data"
+                    f"No history data found in {TEST_INFO_PROJECT_NAME}, returned empty list"
                 )
                 return []
             for hit in payload:
@@ -250,7 +287,7 @@ def query_history_data(gpu_type):
                         f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned data with no _id"
                     )
-                    return []
+                    # Invalid data, return None
+                    return None
                 data_list.append(data_dict)
             print_info(
                 f"Successfully query from {TEST_INFO_PROJECT_NAME}, queried {len(data_list)} entries"
@@ -259,7 +296,7 @@ def query_history_data(gpu_type):
     except Exception as e:
         print_info(
             f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned error: {e}")
-        return []
+        return None


 def match(history_data, new_data, match_keys):
@@ -329,7 +366,7 @@ def calculate_best_perf_result(history_data_list, new_data):
     return best_metrics


-def get_history_data(new_data_dict, gpu_type, match_keys):
+def get_history_data(new_data_dict, match_keys, common_values_dict):
     """
     Query history post-merge data for each cmd_idx
     """
@@ -371,15 +408,23 @@ def get_history_data(new_data_dict, gpu_type, match_keys):
                          key=lambda x: parse_timestamp(x.get("@timestamp", 0)))
         return latest_data

+    cmd_idxs = new_data_dict.keys()
+    history_data_list = None
+    if cmd_idxs:
+        history_data_list = query_history_data(common_values_dict)
+
+    # If query_history_data returned None, it means network failure
+    if history_data_list is None:
+        return None, None
+
+    # Query was successful (even if empty list), initialize dicts
     history_baseline_dict = {}
     history_data_dict = {}
-    cmd_idxs = new_data_dict.keys()
     for cmd_idx in cmd_idxs:
         history_data_dict[cmd_idx] = []
         history_baseline_dict[cmd_idx] = []
-    history_data_list = []
-    if cmd_idxs:
-        history_data_list = query_history_data(gpu_type)

     # Process history data if we have any
     if history_data_list:
         for history_data in history_data_list:
             for cmd_idx in cmd_idxs:
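The new return convention distinguishes a failed query from an empty result: `(None, None)` means the OpenSearch request itself failed (network error) and callers skip the regression check entirely, while empty dicts mean the query succeeded but no matching history exists. A stubbed sketch of that caller contract (the stub is ours, for illustration only):

```python
def get_history_data_stub(query_ok: bool):
    # Mirrors the contract above: (None, None) on query failure,
    # per-cmd dicts (possibly empty) on success.
    if not query_ok:
        return None, None
    return {}, {}

history_baseline_dict, history_data_dict = get_history_data_stub(query_ok=False)
if history_baseline_dict is None:
    print("history query failed; skipping regression check and baseline update")
```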
@@ -390,7 +435,9 @@ def get_history_data(new_data_dict, gpu_type, match_keys):
                 else:
                     history_data_dict[cmd_idx].append(history_data)
                 break

+    # Sometime database has several baselines and we only use the latest baseline one
+    # If list is empty, set to None for each cmd_idx
     for cmd_idx, baseline_list in history_baseline_dict.items():
         latest_baseline = get_latest_data(baseline_list)
         history_baseline_dict[cmd_idx] = latest_baseline
@@ -430,24 +477,27 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
     2. For Minimize metrics, if new perf is above baseline * (1 + threshold)
        Set it as regressive.
     """
+    # If history_baseline_dict is None (network failure), skip regression check
+    if history_baseline_dict is None:
+        return []
+
     regressive_data_list = []
-    cmd_idxs = new_data_dict.keys()
     # Find regressive test cases
-    for cmd_idx in cmd_idxs:
+    for cmd_idx in new_data_dict:
         if history_baseline_dict[cmd_idx] is None:
             continue

-        baseline_data = history_baseline_dict[cmd_idx]
+        history_baseline = history_baseline_dict[cmd_idx]
         new_data = new_data_dict[cmd_idx]
         is_regressive = False
         regressive_metrics = []

         # Check MAXIMIZE_METRICS (new should be >= baseline * (1 - threshold))
         for metric in MAXIMIZE_METRICS:
-            if metric not in new_data or metric not in baseline_data:
+            if metric not in new_data or metric not in history_baseline:
                 continue
-            threshold = get_threshold(baseline_data, metric)
-            baseline_value = baseline_data[metric]
+            threshold = get_threshold(history_baseline, metric)
+            baseline_value = history_baseline[metric]
             new_value = new_data[metric]
             # Regressive if new_value < baseline_value * (1 - threshold)
             if new_value < baseline_value * (1 - threshold):
@@ -456,10 +506,10 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):

         # Check MINIMIZE_METRICS (new should be <= baseline * (1 + threshold))
         for metric in MINIMIZE_METRICS:
-            if metric not in new_data or metric not in baseline_data:
+            if metric not in new_data or metric not in history_baseline:
                 continue
-            threshold = get_threshold(baseline_data, metric)
-            baseline_value = baseline_data[metric]
+            threshold = get_threshold(history_baseline, metric)
+            baseline_value = history_baseline[metric]
             new_value = new_data[metric]
             # Regressive if new_value > baseline_value * (1 + threshold)
             if new_value > baseline_value * (1 + threshold):
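A worked example of the two rules above, with illustrative numbers and the default post-merge threshold of 0.05:

```python
# Illustrative values only; d_* metric names follow the conventions above.
baseline_value = 1000.0   # metric value from the history baseline
threshold = 0.05          # POST_MERGE_THRESHOLD default
new_value = 940.0

# MAXIMIZE metric (e.g. throughput): regressive when it drops below the band.
maximize_regressed = new_value < baseline_value * (1 - threshold)  # 940 < 950 -> True

# MINIMIZE metric (e.g. latency): regressive when it rises above the band.
minimize_regressed = new_value > baseline_value * (1 + threshold)  # 940 > 1050 -> False
```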
@@ -471,9 +521,9 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
             regressive_data = new_data.copy()
             # Add baseline values and thresholds for all metrics
             for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
-                if metric in baseline_data:
+                if metric in history_baseline:
                     baseline_key = f"d_baseline_{metric[2:]}"
-                    regressive_data[baseline_key] = baseline_data[metric]
+                    regressive_data[baseline_key] = history_baseline[metric]

                     # Copy all threshold keys from baseline
                     metric_suffix = metric[2:]
@@ -482,8 +532,8 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
                         f"d_threshold_post_merge_{metric_suffix}",
                         f"d_threshold_pre_merge_{metric_suffix}"
                     ]:
-                        if threshold_key in baseline_data:
-                            regressive_data[threshold_key] = baseline_data[
+                        if threshold_key in history_baseline:
+                            regressive_data[threshold_key] = history_baseline[
                                 threshold_key]

             # Add regression info string
@@ -495,11 +545,24 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
     return regressive_data_list


-def prepare_baseline_data(history_data_dict, new_data_dict):
+def _is_valid_baseline(baseline_data):
+    """Check if baseline data is valid (non-empty dict)."""
+    if isinstance(baseline_data, dict) and len(baseline_data) > 0:
+        return True
+    return False
+
+
+def prepare_baseline_data(history_baseline_dict, history_data_dict,
+                          new_data_dict):
     """
     Calculate new baseline from history post-merge data and new data.
     Then return new baseline data.
     """
+    # If history_baseline_dict and history_data_dict are None (network failure),
+    # return None to indicate we cannot prepare baseline data
+    if history_baseline_dict is None and history_data_dict is None:
+        return {}
+
     new_baseline_data_dict = {}
     cmd_idxs = new_data_dict.keys()
     # Find the best history post-merge data for each cmd
@@ -507,18 +570,42 @@ def prepare_baseline_data(history_data_dict, new_data_dict):
         # Calculate best metrics from history post-merge data and new data
         best_metrics = calculate_best_perf_result(history_data_dict[cmd_idx],
                                                   new_data_dict[cmd_idx])

         # Create new_baseline_data from new_data_dict and set b_is_baseline
         new_baseline_data = new_data_dict[cmd_idx].copy()
         new_baseline_data["b_is_baseline"] = True
-        # Add or update baseline metrics and thresholds
-        for metric, value in best_metrics.items():
-            new_baseline_data[metric] = value

+        # Initialize metric_threshold_dict with default thresholds for all metrics
+        metric_threshold_dict = {}
         for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
             metric_suffix = metric[2:]
             post_merge_key = f"d_threshold_post_merge_{metric_suffix}"
             pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}"
-            new_baseline_data[post_merge_key] = new_baseline_data.get(
-                post_merge_key, POST_MERGE_THRESHOLD)
-            new_baseline_data[pre_merge_key] = new_baseline_data.get(
-                pre_merge_key, PRE_MERGE_THRESHOLD)
+            metric_threshold_dict[post_merge_key] = POST_MERGE_THRESHOLD
+            metric_threshold_dict[pre_merge_key] = PRE_MERGE_THRESHOLD
+
+        # If history baseline is valid, extract thresholds and update metric_threshold_dict
+        history_baseline = history_baseline_dict[cmd_idx]
+        if _is_valid_baseline(history_baseline):
+            for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
+                metric_suffix = metric[2:]
+                post_merge_key = f"d_threshold_post_merge_{metric_suffix}"
+                pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}"
+                if post_merge_key in history_baseline:
+                    metric_threshold_dict[post_merge_key] = history_baseline[
+                        post_merge_key]
+                if pre_merge_key in history_baseline:
+                    metric_threshold_dict[pre_merge_key] = history_baseline[
+                        pre_merge_key]
+
+        # Update new_baseline_data with best_metrics values
+        for metric, value in best_metrics.items():
+            new_baseline_data[metric] = value
+
+        # Add all thresholds to new_baseline_data
+        for threshold_key, threshold_value in metric_threshold_dict.items():
+            new_baseline_data[threshold_key] = threshold_value

         add_id(new_baseline_data)
         new_baseline_data_dict[cmd_idx] = new_baseline_data
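The threshold handling above installs defaults first and then lets any threshold stored on the latest valid history baseline override them. An illustrative walk-through with hypothetical values:

```python
# Defaults installed for every metric (values mirror the module constants):
metric_threshold_dict = {
    "d_threshold_post_merge_output_throughput": 0.05,  # POST_MERGE_THRESHOLD
    "d_threshold_pre_merge_output_throughput": 0.1,    # PRE_MERGE_THRESHOLD
}

# Hypothetical latest valid baseline carrying a hand-tuned post-merge threshold:
history_baseline = {"d_threshold_post_merge_output_throughput": 0.08}

# Baseline-stored thresholds win over defaults:
metric_threshold_dict.update(
    {k: v for k, v in history_baseline.items() if k in metric_threshold_dict})
# -> post-merge threshold becomes 0.08, pre-merge stays at the 0.1 default
```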
@@ -539,7 +626,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict,
         if cmd_idx in new_data_dict:
             data_list.append(new_data_dict[cmd_idx])
     # Only post regressive test cases when post-merge.
-    if new_baseline_data_dict:
+    # new_baseline_data_dict is None means pre-merge.
+    if new_baseline_data_dict and regressive_data_list:
         data_list.extend(regressive_data_list)
     if not data_list:
         return
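An illustrative composition of the upload payload implied above (all data hypothetical): new results always upload, while regressive entries are appended only when a baseline dict exists, i.e. only on post-merge runs.

```python
# Post-merge run: a baseline dict exists, so regressions ride along.
new_baseline_data_dict = {0: {"b_is_baseline": True}}
new_data_dict = {0: {"d_output_throughput": 940.0}}
regressive_data_list = [{"s_regression_info": "d_output_throughput"}]

data_list = list(new_data_dict.values())
if new_baseline_data_dict and regressive_data_list:
    data_list.extend(regressive_data_list)
# data_list now holds the new result plus the regression record;
# on pre-merge runs new_baseline_data_dict is None and only results upload.
```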
@@ -148,8 +148,8 @@ def main():
     job_workspace = sys.argv[1]

     if not os.path.isdir(job_workspace):
-        print(f"Error: {job_workspace} is not a valid directory")
-        sys.exit(1)
+        print(f"Skipping perf regression check since {job_workspace} is not a valid directory.")
+        return 0

     perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml")
     all_perf_data = read_yaml_data(perf_data_files)
@@ -171,15 +171,33 @@ def main():
         print("=" * 60)
         print_regression_data(data)

+    # Split regression data into post-merge and pre-merge categories
+    post_merge_regressions = [
+        data for data in all_regression_data if data.get("b_is_post_merge", False)
+    ]
+    pre_merge_regressions = [
+        data for data in all_regression_data if not data.get("b_is_post_merge", False)
+    ]
+
     if len(all_regression_data) == 0:
         print("\n No regression data found. Perf check is successful.")
         return 0
-    else:
+
+    if len(pre_merge_regressions) > 0:
         print(
-            f"\n Warning: Found {len(all_regression_data)} regression data. Perf check is failed."
+            f"\n Warning: Found {len(pre_merge_regressions)} pre-merge regression data. "
+            "But we don't fail the check temporarily."
         )
-        return 1
+
+    if len(post_merge_regressions) > 0:
+        print(
+            f"\n Error: Found {len(post_merge_regressions)} post-merge regression data. Perf check is failed."
+        )
+        return 1
+
+    print("\n No post-merge regression data found. Perf check is successful.")
+    return 0


 if __name__ == "__main__":
     sys.exit(main())
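A minimal sketch of how the pipeline consumes this script's exit code (the invocation mirrors the Jenkins stage earlier in this diff; paths are hypothetical): 0 means no post-merge regression, nonzero fails the stage.

```python
import subprocess

# Hypothetical workspace path; the Jenkins stage passes ${jobWorkspace}.
proc = subprocess.run(
    ["python3", "tests/integration/defs/perf/perf_regression_check.py",
     "/workspace/job"],
    check=False,
)
if proc.returncode != 0:
    # Equivalent of the pipeline's `error "Performance regression detected..."`.
    raise RuntimeError(
        f"Performance regression detected (exit code: {proc.returncode})")
```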
tests/integration/defs/perf/test_perf_sanity.py — new file, 1474 lines (file diff suppressed because it is too large)
@@ -19,22 +19,17 @@ import io
 import os
 import re
 import subprocess
 import time
 from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Dict, List, NamedTuple, Optional

-import requests
 import yaml
 from _pytest.nodes import Item
 from _pytest.python import Function
 from defs.trt_test_alternative import (check_output, popen, print_error,
                                        print_info)
-from test_common.http_utils import wait_for_endpoint_ready

-from tensorrt_llm._utils import get_free_port
-
 from ..common import get_trt_llm_lib_dir, venv_mpi_check_output
 from ..local_venv import PythonVenvRunnerImpl
 from ..test_list_parser import parse_test_list
@@ -243,55 +238,6 @@ class PerfBenchScriptTestCmds(NamedTuple):
         return cmd_str


-class PerfAggrScriptTestCmds(NamedTuple):
-    server_cmds: List[List[str]]
-    client_cmds: List[List[str]]
-    names: List[str]
-    timeout: int
-    output_dir: str
-
-    def run_cmd(self, cmd_idx: int, venv) -> str:
-        output = ""
-        server_proc = None
-        server_file_path = os.path.join(
-            self.output_dir, f"trtllm-serve.{self.names[cmd_idx]}.log")
-        client_file_path = os.path.join(
-            self.output_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log")
-        try:
-            server_hostname = "localhost"
-            server_port = get_free_port()
-            server_cmd = add_host_port_to_cmd(self.server_cmds[cmd_idx],
-                                              server_hostname, server_port)
-            print_info(f"Starting server. cmd is {server_cmd}")
-            with open(server_file_path, 'w') as server_ctx:
-                server_proc = subprocess.Popen(
-                    server_cmd,
-                    stdout=server_ctx,
-                    stderr=subprocess.STDOUT,
-                    env=copy.deepcopy(os.environ),
-                )
-            wait_for_endpoint_ready(
-                f"http://{server_hostname}:{server_port}/health",
-                timeout=self.timeout)
-            client_cmd = add_host_port_to_cmd(self.client_cmds[cmd_idx],
-                                              server_hostname, server_port)
-            print_info(f"Starting client. cmd is {client_cmd}")
-            output = subprocess.check_output(
-                client_cmd,
-                stderr=subprocess.STDOUT,
-                env=copy.deepcopy(os.environ),
-            ).decode()
-            with open(client_file_path, 'w') as client_ctx:
-                client_ctx.write(output)
-        finally:
-            server_proc.terminate()
-            server_proc.wait()
-        return output
-
-    def get_cmd_str(self, cmd_idx) -> List[str]:
-        return ["aggr_server tests, please check config files"]
-
-
 class PerfDisaggScriptTestCmds(NamedTuple):
     ctx_cmd: str
     gen_cmd: str
@@ -341,249 +287,6 @@ class PerfDisaggScriptTestCmds(NamedTuple):
         return ["disaggregated server tests, please check config files"]


-class PerfMultiNodeDisaggScriptTestCmds(NamedTuple):
-    ctx_server_cmds: List[List[str]]
-    gen_server_cmds: List[List[str]]
-    disagg_server_cmds: List[List[str]]
-    benchmark_cmds: List[List[str]]
-    timeout: int
-    hostname: str
-    disagg_serving_type: str
-    num_ctx_servers: int
-    num_gen_servers: int
-    output_dir: str
-
-    def _generate_hostname_file(self, cmd_idx: int, port: int):
-        # Create hostnames directory
-        hostnames_dir = os.path.join(self.output_dir, f"hostnames-{cmd_idx}")
-        if not os.path.exists(hostnames_dir):
-            os.makedirs(hostnames_dir, exist_ok=True)
-        hostname_file = os.path.join(hostnames_dir,
-                                     f"{self.disagg_serving_type}.txt")
-        with open(hostname_file, 'w') as f:
-            f.write(f"{self.hostname}:{port}")
-
-    def _generate_disagg_server_config(self, cmd_idx: int,
-                                       disagg_server_port: int) -> str:
-        print_info(
-            f"Generating disagg server config for command index {cmd_idx}")
-        hostnames_folder = os.path.join(self.output_dir, f"hostnames-{cmd_idx}")
-        expected_count = self.num_ctx_servers + self.num_gen_servers
-        start_time = time.time()
-        hostnames = []
-        while True:
-            elapsed_time = time.time() - start_time
-            print_info(
-                f"Waiting for hostnames in {hostnames_folder}, elapsed time: {elapsed_time}s, current: {len(hostnames)}, expected: {expected_count}"
-            )
-            if elapsed_time > self.timeout:
-                print_error(
-                    f"Time out. Hostnames files are not ready after {self.timeout}s"
-                )
-            time.sleep(10)
-            if not os.path.exists(hostnames_folder):
-                continue
-            hostnames = os.listdir(hostnames_folder)
-            if len(hostnames) >= expected_count:
-                break
-        print_info(
-            f"All hostnames found in {hostnames_folder} after elapsed time: {elapsed_time}s"
-        )
-
-        # Read ctx and gen hostnames
-        ctx_hostnames = []
-        gen_hostnames = []
-        for hostname_file in hostnames:
-            hostname_file_path = os.path.join(hostnames_folder, hostname_file)
-            with open(hostname_file_path, 'r') as f:
-                hostname_port = f.read().strip()
-            hostname = hostname_port.split(":")[0]
-            port = hostname_port.split(":")[1]
-            print_info(
-                f"Hostname File: {hostname_file_path} Hostname: {hostname_port} Port: {port}"
-            )
-            if hostname_file.startswith("CTX"):
-                ctx_hostnames.append(hostname_port)
-            elif hostname_file.startswith("GEN"):
-                gen_hostnames.append(hostname_port)
-
-        server_config = {
-            'hostname': self.hostname,
-            'port': disagg_server_port,
-            'backend': 'pytorch',
-            'context_servers': {
-                'num_instances': self.num_ctx_servers,
-                'urls': ctx_hostnames,
-            },
-            'generation_servers': {
-                'num_instances': self.num_gen_servers,
-                'urls': gen_hostnames,
-            }
-        }
-        config_path = os.path.join(self.output_dir,
-                                   f"server_config.{cmd_idx}.yaml")
-        with open(config_path, 'w') as f:
-            yaml.dump(server_config, f)
-        print_info(f"Server config file {config_path} generated")
-        return config_path
-
-    def _get_disagg_server_hostname_and_port(self, cmd_idx: int) -> tuple:
-        config_path = os.path.join(self.output_dir,
-                                   f"server_config.{cmd_idx}.yaml")
-        start_time = time.time()
-        while True:
-            if os.path.exists(config_path):
-                print_info(f"Server config file found: {config_path}")
-                break
-            elapsed_time = time.time() - start_time
-            if elapsed_time > self.timeout:
-                print_error(
-                    f"Server config file {config_path} not found after {self.timeout}s"
-                )
-            print_info(
-                f"Waiting for server config file, elapsed time: {elapsed_time}s"
-            )
-            time.sleep(10)  # Check every 10 seconds
-
-        # Read server config to get hostname and port
-        with open(config_path, 'r') as f:
-            server_config = yaml.safe_load(f)
-        disagg_server_hostname = server_config['hostname']
-        disagg_server_port = server_config['port']
-        return disagg_server_hostname, disagg_server_port
-
-    def wait_for_benchmark_ready(self,
-                                 benchmark_status_file: str,
-                                 timeout: int = 7200):
-        start_time = time.time()
-        while True:
-            if os.path.exists(benchmark_status_file):
-                print_info(
-                    f"Benchmark status file found, terminating server {self.disagg_serving_type}"
-                )
-                break
-            elapsed_time = time.time() - start_time
-            print_info(
-                f"Waiting for benchmark status file, elapsed time: {elapsed_time}s"
-            )
-            if elapsed_time > timeout:
-                print_error(
-                    f"Timeout waiting for benchmark status file after {timeout}s, terminating server {self.disagg_serving_type}"
-                )
-                break
-            time.sleep(10)  # Check every 10 seconds
-
-    def wait_for_endpoint_ready(self, url: str, timeout: int = 7200):
-        start = time.monotonic()
-        while True:
-            elapsed_time = time.monotonic() - start
-            if elapsed_time > timeout:
-                print_error(
-                    f"Timeout waiting for endpoint {url} to be ready after {timeout} seconds"
-                )
-                break
-            print_info(
-                f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s"
-            )
-            try:
-                time.sleep(10)
-                if requests.get(url).status_code == 200:
-                    print_info(f"endpoint {url} is ready")
-                    return
-            except Exception as err:
-                print_info(
-                    f"endpoint {url} is not ready, with exception: {err}")
-        print_error(
-            f"Endpoint {url} did not become ready within {timeout} seconds")
-
-    def run_cmd(self, cmd_idx: int, venv) -> str:
-        output = ""
-        server_proc = None
-        benchmark_status_file = os.path.join(self.output_dir,
-                                             f"benchmark_status.{cmd_idx}.txt")
-        port = get_free_port()
-        if "CTX" in self.disagg_serving_type or "GEN" in self.disagg_serving_type:
-            self._generate_hostname_file(cmd_idx, port)
-            server_file_path = os.path.join(
-                self.output_dir,
-                f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log")
-            is_ctx = "CTX" in self.disagg_serving_type
-            server_cmd = self.ctx_server_cmds[
-                cmd_idx] if is_ctx else self.gen_server_cmds[cmd_idx]
-            server_cmd = add_host_port_to_cmd(server_cmd, self.hostname, port)
-            try:
-                print_info(
-                    f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd}"
-                )
-                with open(server_file_path, 'w') as server_ctx:
-                    server_proc = subprocess.Popen(
-                        server_cmd,
-                        stdout=server_ctx,
-                        stderr=subprocess.STDOUT,
-                        env=copy.deepcopy(os.environ),
-                    )
-                self.wait_for_benchmark_ready(benchmark_status_file,
-                                              timeout=self.timeout)
-            finally:
-                print_info(f"Server {self.disagg_serving_type} stopped")
-                server_proc.terminate()
-                server_proc.wait()
-        elif self.disagg_serving_type == "DISAGG_SERVER":
-            disagg_server_file_path = os.path.join(
-                self.output_dir,
-                f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log")
-            disagg_server_cmd = self.disagg_server_cmds[cmd_idx]
-            try:
-                self._generate_disagg_server_config(cmd_idx, port)
-                print_info(
-                    f"Starting disagg server. disagg_serving_type: {self.disagg_serving_type} disagg server cmd is {disagg_server_cmd}"
-                )
-                with open(disagg_server_file_path, 'w') as disagg_server_ctx:
-                    disagg_server_proc = subprocess.Popen(
-                        disagg_server_cmd,
-                        stdout=disagg_server_ctx,
-                        stderr=subprocess.STDOUT,
-                        env=copy.deepcopy(os.environ),
-                    )
-                self.wait_for_benchmark_ready(benchmark_status_file,
-                                              timeout=self.timeout)
-            finally:
-                print_info(f"Disagg server {self.disagg_serving_type} stopped")
-                disagg_server_proc.terminate()
-                disagg_server_proc.wait()
-        elif self.disagg_serving_type == "BENCHMARK":
-            benchmark_file_path = os.path.join(
-                self.output_dir, f"trtllm-benchmark.{cmd_idx}.log")
-            try:
-                disagg_server_hostname, disagg_server_port = self._get_disagg_server_hostname_and_port(
-                    cmd_idx)
-                benchmark_cmd = add_host_port_to_cmd(
-                    self.benchmark_cmds[cmd_idx], disagg_server_hostname,
-                    disagg_server_port)
-                self.wait_for_endpoint_ready(
-                    f"http://{disagg_server_hostname}:{disagg_server_port}/health",
-                    timeout=self.timeout,
-                )
-                print_info(
-                    f"Starting benchmark. disagg_serving_type: {self.disagg_serving_type} benchmark cmd is {benchmark_cmd}"
-                )
-                output = subprocess.check_output(
-                    benchmark_cmd,
-                    env=copy.deepcopy(os.environ),
-                    stderr=subprocess.STDOUT).decode()
-                with open(benchmark_file_path, 'w') as benchmark_ctx:
-                    benchmark_ctx.write(output)
-            finally:
-                with open(benchmark_status_file, 'w') as status_file:
-                    status_file.write("Done")
-        return output
-
-    def get_cmd_str(self, cmd_idx) -> List[str]:
-        return [
-            "multi-node disaggregated server tests, please check config files"
-        ]


 class AbstractPerfScriptTestClass(abc.ABC):
     """
     Abstract class for all script-based perf tests.
@@ -715,14 +418,6 @@ class AbstractPerfScriptTestClass(abc.ABC):

         cmd_str = commands.get_cmd_str(cmd_idx)
         is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str
-        is_perf_sanity_test = "perf_sanity" in full_test_name
-
-        is_disagg_server = False
-        if self._config.runtime == "multi_node_disagg_server":
-            disagg_serving_type = self._config.disagg_configs[0][
-                'disagg_serving_type']
-            is_disagg_server = disagg_serving_type != "BENCHMARK"

         # Start the timer.
         self._start_timestamp = datetime.utcnow()
         try:
@@ -730,8 +425,7 @@ class AbstractPerfScriptTestClass(abc.ABC):
             # Capture the stdout from _gpu_clock_lock because the pipeline JUnit update script tries to parse
             # the log to find the GPU clocks.
             with io.StringIO() as buf:
-                # Perf-sanity test doesn't lock gpu clock
-                if self._gpu_clock_lock and not is_perf_sanity_test:
+                if self._gpu_clock_lock:
                     # Lock GPU clock and start monitoring.
                     with contextlib.redirect_stdout(
                             buf), self._gpu_clock_lock, tmpDir:
@@ -746,7 +440,7 @@ class AbstractPerfScriptTestClass(abc.ABC):
                 print(collect_and_clean_myelin_time(output))

             # Check whether output has error message
-            if not is_prepare_dataset_cmd and is_perf_sanity_test:
+            if not is_prepare_dataset_cmd:
                 self._check_benchmark_output_for_errors(output)

             # Print the output log to stdout and cache it.
@@ -793,10 +487,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
                 f"skip writing perf result when calling generating dataset in trtllm-bench."
             )
             outputs.pop(cmd_idx)
-        elif is_disagg_server:
-            print_info(
-                f"skip writing perf result when running disagg's worker or server."
-            )
         else:
             self._perf_result = self.get_perf_result(outputs)
@@ -818,11 +508,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
         Store the test results in the _test_results.
         Write the test results and GPU monitoring data to the output csv and/or yaml files.
         """
-        # Store the test result
-        if cmd_idx not in self._test_results:
-            self._test_results[cmd_idx] = {}
-        self._test_results[cmd_idx][metric_type] = self._perf_result
-
         # Get GPU monitoring data
         self._gpu_monitor_data = self._gpu_clock_lock.get_state_data()
         if not self._gpu_monitor_data:
@@ -15,9 +15,9 @@ l0_dgx_b200_perf_sanity:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)

 - condition:
     ranges:
@@ -34,8 +34,6 @@ l0_dgx_b200_perf_sanity:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
@@ -16,9 +16,9 @@ l0_dgx_b300_perf_sanity:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)

 - condition:
     ranges:
@@ -36,6 +36,6 @@ l0_dgx_b300_perf_sanity:
       backend: pytorch
       orchestrator: mpi
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
@@ -1,5 +1,33 @@
 version: 0.0.1
 l0_gb200_multi_gpus_perf_sanity:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*gb200*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+    terms:
+      stage: pre_merge
+      backend: pytorch
+  tests:
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)
 - condition:
     ranges:
       system_gpu_count:
@@ -14,6 +42,17 @@ l0_gb200_multi_gpus_perf_sanity:
       stage: post_merge
       backend: pytorch
   tests:
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k]
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k]
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)
@@ -0,0 +1,17 @@
+version: 0.0.1
+l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes:
+- condition:
+    ranges:
+      # 2 nodes with each node has 4 GPUs
+      system_gpu_count:
+        gte: 8
+        lte: 8
+    wildcards:
+      gpu:
+      - '*gb200*'
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1] TIMEOUT (90)
+  - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] TIMEOUT (90)
@@ -1,16 +0,0 @@
-version: 0.0.1
-l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001:
-- condition:
-    ranges:
-      # 2 nodes with each node has 4 GPUs
-      system_gpu_count:
-        gte: 8
-        lte: 8
-    wildcards:
-      gpu:
-      - '*gb200*'
-    terms:
-      stage: post_merge
-      backend: pytorch
-  tests:
-  - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1]
@@ -1,5 +1,5 @@
 version: 0.0.1
-l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001:
+l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes:
 - condition:
     ranges:
       # 3 nodes with each node has 4 GPUs
@ -13,4 +13,4 @@ l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (180)
|
||||
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (90)
|
||||
@ -0,0 +1,17 @@
|
||||
version: 0.0.1
|
||||
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes:
|
||||
- condition:
|
||||
ranges:
|
||||
# 6 nodes with each node has 4 GPUs
|
||||
system_gpu_count:
|
||||
gte: 24
|
||||
lte: 24
|
||||
wildcards:
|
||||
gpu:
|
||||
- '*gb200*'
|
||||
terms:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
|
||||
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (90)
|
||||
@ -1,16 +0,0 @@
|
||||
version: 0.0.1
|
||||
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001:
|
||||
- condition:
|
||||
ranges:
|
||||
# 6 nodes with each node has 4 GPUs
|
||||
system_gpu_count:
|
||||
gte: 24
|
||||
lte: 24
|
||||
wildcards:
|
||||
gpu:
|
||||
- '*gb200*'
|
||||
terms:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)
|
||||
@ -1,16 +0,0 @@
|
||||
version: 0.0.1
|
||||
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002:
|
||||
- condition:
|
||||
ranges:
|
||||
# 6 nodes with each node has 4 GPUs
|
||||
system_gpu_count:
|
||||
gte: 24
|
||||
lte: 24
|
||||
wildcards:
|
||||
gpu:
|
||||
- '*gb200*'
|
||||
terms:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (180)
|
||||
@ -1,5 +1,5 @@
|
||||
version: 0.0.1
|
||||
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001:
|
||||
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes:
|
||||
- condition:
|
||||
ranges:
|
||||
# 8 nodes with each node has 4 GPUs
|
||||
@ -13,4 +13,4 @@ l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001:
|
||||
stage: post_merge
|
||||
backend: pytorch
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)
|
||||
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
|
||||
@ -11,24 +11,69 @@ Performance sanity testing scripts for TensorRT-LLM with configuration-driven te

## Configuration File Types

There are three types of YAML config files for different deployment architectures.
Aggregated config files are in [`tests/scripts/perf-sanity`](./).
Disaggregated config files are in [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf).
There are two modes for perf sanity tests: aggregated (aggr) and disaggregated (disagg).

### 1. Single-Node Aggregated Test Configuration
### Aggregated Mode (aggr)

**File Example**: `deepseek_r1_fp4_v2_grace_blackwell.yaml`
**Config Location**: [`tests/scripts/perf-sanity`](./)

**Use Case**: Single-node performance tests on a single server with multiple GPUs.
**File Naming**: `xxx.yaml`, where words are joined with `_` (underscore), not `-` (hyphen).

### 2. Multi-Node Aggregated Test Configuration
**File Examples**:
- `deepseek_r1_fp4_v2_grace_blackwell.yaml` - Single-node aggregated test
- `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml` - Multi-node aggregated test

**File Example**: `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml`
**Use Cases**:
- Single-node: Performance tests on a single server with multiple GPUs
- Multi-node: The model runs across multiple nodes with unified execution

**Use Case**: Multi-node aggregated architecture where the model runs across multiple nodes with unified execution.
**Test Case Names**:
```
perf/test_perf_sanity.py::test_e2e[aggr_upload-{config yaml file base name}]
perf/test_perf_sanity.py::test_e2e[aggr_upload-{config yaml file base name}-{server_config_name}]
```

### 3. Multi-Node Disaggregated Test Configuration
- Without a server config name: runs all server configs in the YAML file
- With a server config name: runs only the specified server config (the `name` field in `server_configs`)

**Examples**:
```
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell]
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k]
```
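
Because aggregated config base names use underscores only, the hyphen acts as an unambiguous field separator within these ids. Below is a minimal shell sketch of that decomposition; it is illustrative only and not part of the test harness, and the `id` value is just the example from above:

```bash
# Minimal sketch: split an aggregated test id into its parts.
# This works only because aggr config base names never contain hyphens.
id="aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k"
IFS='-' read -r mode config_base server_config <<< "${id}"
echo "mode:          ${mode}"                                      # aggr_upload
echo "config file:   ${config_base}.yaml"                          # the YAML base name
echo "server config: ${server_config:-all server configs in file}" # optional filter
```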

### Disaggregated Mode (disagg)

**Config Location**: [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf)

**File Naming**: `xxx.yaml` (may contain `-` hyphens).

**File Example**: `deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml`

**Use Case**: Disaggregated architecture where the model runs across multiple nodes with separate context (prefill) and generation (decode) servers.

**Test Case Name**:
```
perf/test_perf_sanity.py::test_e2e[disagg_upload-{config yaml file base name}]
```

**Example**:
```
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX]
```
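
Since disaggregated base names may themselves contain hyphens, the id cannot be split on `-`; only the fixed `disagg_upload-` prefix is stripped. A minimal sketch under the same caveat as above:

```bash
# Minimal sketch: recover the disagg config file name from a test id.
# The base name may contain hyphens, so only the fixed prefix is removed.
id="disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX"
config_base="${id#disagg_upload-}"
echo "config file: ${config_base}.yaml"
```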

## Running Tests

**Important**: Do NOT add the `--perf` flag when running pytest. Perf sanity tests are static test cases and do not use perf mode.

```bash
# Run all server configs in an aggregated test
pytest perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell]

# Run a specific server config in an aggregated test
pytest perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]

# Run a specific disaggregated test
pytest perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX]
```
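
To preview which ids are collectable on a given checkout before launching anything, standard pytest collection should work here as well (the exact ids printed depend on the config files present):

```bash
# List available perf sanity test ids without running them
pytest --collect-only -q perf/test_perf_sanity.py

# Narrow collection to one config file via pytest's -k keyword filter
pytest --collect-only -q perf/test_perf_sanity.py -k "gpt_oss_120b_fp4_grace_blackwell"
```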

@ -35,8 +35,9 @@ server_configs:
        iterations: 12
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tep8_mtp3"
    model_name: "deepseek_r1_0528_fp4_v2"
    trust_remote_code: true
@ -64,5 +65,5 @@ server_configs:
        iterations: 12
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

@ -30,12 +30,12 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 1
    client_configs:
      - name: "con2048_iter10_1k1k"
      - name: "con2048_iter5_1k1k"
        concurrency: 2048
        iterations: 10
        iterations: 5
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tep4_mtp3_1k1k"
@ -65,7 +65,7 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tp4_mtp3_1k1k"
@ -95,5 +95,5 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

@ -31,12 +31,12 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 1
    client_configs:
      - name: "con2048_iter10_1k1k"
      - name: "con2048_iter5_1k1k"
        concurrency: 2048
        iterations: 10
        iterations: 5
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tep4_mtp3_1k1k"
@ -66,7 +66,7 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tp4_mtp3_1k1k"
@ -96,7 +96,7 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  # 8k1k configs
@ -126,12 +126,12 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 1
    client_configs:
      - name: "con2048_iter10_8k1k"
      - name: "con2048_iter5_8k1k"
        concurrency: 2048
        iterations: 10
        iterations: 5
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tep4_mtp3_8k1k"
@ -161,7 +161,7 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -191,7 +191,7 @@ server_configs:
        iterations: 10
        isl: 8192
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  # 1k8k configs
@ -221,12 +221,12 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 1
    client_configs:
      - name: "con2048_iter10_1k8k"
      - name: "con2048_iter5_1k8k"
        concurrency: 2048
        iterations: 10
        iterations: 5
        isl: 1024
        osl: 8192
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tep4_mtp3_1k8k"
@ -256,7 +256,7 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 8192
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp4_v2_tp4_mtp3_1k8k"
@ -286,5 +286,5 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 8192
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

@ -30,12 +30,12 @@ server_configs:
      decoding_type: 'MTP'
      num_nextn_predict_layers: 1
    client_configs:
      - name: "con4096_iter10_1k1k"
      - name: "con4096_iter5_1k1k"
        concurrency: 4096
        iterations: 10
        iterations: 5
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp8_tep8_mtp3_1k1k"
@ -65,7 +65,7 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "r1_fp8_tp8_mtp3_1k1k"
@ -95,5 +95,5 @@ server_configs:
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

@ -4,6 +4,37 @@ metadata:
  - B200
  - B300
server_configs:
  - name: "gpt_oss_fp4_dep4_1k8k"
    model_name: "gpt_oss_120b_fp4"
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    max_batch_size: 640
    max_num_tokens: 20000
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    attention_dp_config:
      enable_balance: true
    moe_config:
      backend: 'TRTLLM'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 640
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
    num_postprocess_workers: 4
    stream_interval: 20
    client_configs:
      - name: "con2560_iter5_1k8k"
        concurrency: 2560
        iterations: 5
        isl: 1024
        osl: 8192
        random_range_ratio: 0.8
        backend: "openai"

  - name: "gpt_oss_fp4_dep2_1k1k"
    model_name: "gpt_oss_120b_fp4"
    tensor_parallel_size: 2
@ -32,25 +63,23 @@ server_configs:
        iterations: 5
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"

  - name: "gpt_oss_fp4_dep4_1k1k"
  - name: "gpt_oss_fp4_tep2_1k8k"
    model_name: "gpt_oss_120b_fp4"
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    tensor_parallel_size: 2
    moe_expert_parallel_size: 2
    pipeline_parallel_size: 1
    max_batch_size: 512
    max_batch_size: 128
    max_num_tokens: 20000
    attn_backend: "TRTLLM"
    enable_attention_dp: true
    attention_dp_config:
      enable_balance: true
    enable_attention_dp: false
    moe_config:
      backend: 'TRTLLM'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 512
      max_batch_size: 128
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
@ -58,12 +87,41 @@ server_configs:
    num_postprocess_workers: 4
    stream_interval: 20
    client_configs:
      - name: "con2048_iter5_1k1k"
        concurrency: 2048
        iterations: 5
      - name: "con128_iter10_1k8k"
        concurrency: 128
        iterations: 10
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        osl: 8192
        random_range_ratio: 0.8
        backend: "openai"

  - name: "gpt_oss_fp4_tp2_1k8k"
    model_name: "gpt_oss_120b_fp4"
    tensor_parallel_size: 2
    moe_expert_parallel_size: 1
    pipeline_parallel_size: 1
    max_batch_size: 8
    max_num_tokens: 20000
    attn_backend: "TRTLLM"
    enable_attention_dp: false
    moe_config:
      backend: 'TRTLLM'
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 8
    kv_cache_config:
      dtype: 'fp8'
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
    num_postprocess_workers: 4
    stream_interval: 20
    client_configs:
      - name: "con8_iter10_1k8k"
        concurrency: 8
        iterations: 10
        isl: 1024
        osl: 8192
        random_range_ratio: 0.8
        backend: "openai"

  - name: "gpt_oss_fp4_tp4_eagle3_1k1k"
@ -97,5 +155,5 @@ server_configs:
        iterations: 32
        isl: 1024
        osl: 1024
        random_range_ratio: 0.2
        random_range_ratio: 0.8
        backend: "openai"