[None][ci] Some tweaks for the CI pipeline (#10359)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2026-01-05 00:10:47 +08:00 committed by GitHub
parent afc533193d
commit c4f27fa4c0
9 changed files with 183 additions and 161 deletions

View File

@ -585,7 +585,7 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
parallelJobs.failFast = enableFailFast
if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) {
def key = "Build with build type Debug"
def key = "Build With Build Type Debug"
parallelJobs += [
(key): {
script {
@ -628,7 +628,7 @@ pipeline {
HF_DATASETS_OFFLINE=1
}
stages {
stage("BuildJob") {
stage("Build Job") {
steps {
launchStages(this, params.targetArch, params.enableFailFast, globalVars)
}

View File

@ -276,7 +276,7 @@ def buildImage(config, imageKeyToTag)
}
// Step 2: Build the images
stage ("Install packages") {
stage ("Install Package") {
sh "pwd && ls -alh"
sh "env | sort"
sh "apk add make git"
@ -380,7 +380,7 @@ def buildImage(config, imageKeyToTag)
}
if (customTag) {
stage ("custom tag: ${customTag} (${arch})") {
stage ("Custom Tag: ${customTag} (${arch})") {
sh """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \
@ -395,7 +395,7 @@ def buildImage(config, imageKeyToTag)
} catch (Exception ex) {
containerGenFailure = ex
} finally {
stage ("Docker logout") {
stage ("Docker Logout") {
withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
sh "docker logout urm.nvidia.com"
sh "docker logout ${DEFAULT_GIT_URL}:5005"
@ -424,14 +424,14 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
def release_action = params.action
def buildConfigs = [
"Build trtllm release (x86_64)": [
"Build Internal release (x86_64 trtllm)": [
target: "trtllm",
action: release_action,
customTag: LLM_BRANCH_TAG + "-x86_64",
build_wheel: true,
dockerfileStage: "release",
],
"Build trtllm release (SBSA)": [
"Build Internal release (SBSA trtllm)": [
target: "trtllm",
action: release_action,
customTag: LLM_BRANCH_TAG + "-sbsa",
@ -439,21 +439,21 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
arch: "arm64",
dockerfileStage: "release",
],
"Build CI image (x86_64 tritondevel)": [:],
"Build CI image (SBSA tritondevel)": [
"Build CI Image (x86_64 tritondevel)": [:],
"Build CI Image (SBSA tritondevel)": [
arch: "arm64",
],
"Build CI image (RockyLinux8 Python310)": [
"Build CI Image (RockyLinux8 Python310)": [
target: "rockylinux8",
args: "PYTHON_VERSION=3.10.12",
postTag: "-py310",
],
"Build CI image (RockyLinux8 Python312)": [
"Build CI Image (RockyLinux8 Python312)": [
target: "rockylinux8",
args: "PYTHON_VERSION=3.12.3",
postTag: "-py312",
],
"Build NGC devel and release (x86_64)": [
"Build NGC devel And release (x86_64)": [
target: "ngc-release",
action: release_action,
args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'",
@ -464,7 +464,7 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
],
dockerfileStage: "release",
],
"Build NGC devel and release (SBSA)": [
"Build NGC devel And release (SBSA)": [
target: "ngc-release",
action: release_action,
args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'",
@ -583,7 +583,7 @@ pipeline {
}
}
}
stage("Upload Artifacts") {
stage("Upload Artifact") {
steps {
script {
String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag
@ -594,7 +594,7 @@ pipeline {
}
}
}
stage("Wait for Build Jobs Complete") {
stage("Wait For Build Job Complete") {
when {
expression {
RUN_SANITY_CHECK
@ -655,7 +655,7 @@ pipeline {
}
}
}
stage("Sanity Check for NGC Images") {
stage("Sanity Check For NGC Image") {
when {
expression {
RUN_SANITY_CHECK
@ -691,7 +691,7 @@ pipeline {
}
}
}
stage("Register NGC Images for Security Checks") {
stage("Register NGC Image For Security Check") {
when {
expression {
return params.nspect_id && params.action == "push"

View File

@ -451,7 +451,7 @@ def launchReleaseCheck(pipeline)
}
def image = "urm.nvidia.com/docker/golang:1.22"
stageName = "Release Check"
stageName = "Release-Check"
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", {
stage("[${stageName}] Run") {
if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) {
@ -834,7 +834,7 @@ def collectTestResults(pipeline, testFilter)
{
collectResultPodSpec = createKubernetesPodConfig("", "agent")
trtllm_utils.launchKubernetesPod(pipeline, collectResultPodSpec, "alpine", {
stage ("Collect test result") {
stage ("Collect Test Result") {
sh "rm -rf **/*.xml *.tar.gz"
testResultLink = "https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}/test-results"
@ -864,7 +864,7 @@ def collectTestResults(pipeline, testFilter)
junit(testResults: '**/results*.xml', allowEmptyResults : true)
} // Collect test result stage
stage("Rerun report") {
stage("Rerun Report") {
sh "rm -rf rerun && mkdir -p rerun"
sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true"
sh "find rerun -type f"
@ -904,7 +904,7 @@ def collectTestResults(pipeline, testFilter)
}
} // Rerun report stage
try {
stage("Test coverage") {
stage("Test Coverage") {
sh "ls"
def CUR_PATH = sh(returnStdout: true, script: 'pwd').replaceAll("\\s","")
sh "echo ${CUR_PATH}"
@ -1030,14 +1030,15 @@ def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64"
def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
{
stages = [
"Release Check": {
"Release-Check": {
script {
launchReleaseCheck(this)
}
},
"x86_64-linux": {
"x86_64-Linux": {
script {
stage("Build") {
def testStageName = "[Build-x86_64] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def additionalParameters = [
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
@ -1045,7 +1046,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
]
launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters)
}
def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
stage(testStageName) {
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
@ -1135,24 +1137,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
}
}
},
"SBSA-linux": {
"SBSA-Linux": {
script {
def jenkinsUrl = ""
def credentials = ""
def testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") {
echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run"
return
}
stage("Build") {
def testStageName = "[Build-SBSA] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def additionalParameters = [
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
]
launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
}
testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
stage(testStageName) {
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
echo "SBSA test job is skipped due to Jenkins configuration"
@ -1269,9 +1270,9 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
echo "Will run Build-Docker-Images job"
stages.remove("x86_64-linux")
stages.remove("SBSA-linux")
echo "Build-Docker-Images job is set explicitly. Both x86_64-linux and SBSA-linux sub-pipelines will be disabled."
stages.remove("x86_64-Linux")
stages.remove("SBSA-Linux")
echo "Build-Docker-Images job is set explicitly. Both x86_64-Linux and SBSA-Linux sub-pipelines will be disabled."
}
parallelJobs = stages.collectEntries{key, value -> [key, {
@ -1339,11 +1340,11 @@ pipeline {
}
}
}
stage("Build and Test") {
stage("Build And Test") {
steps {
script {
if (isReleaseCheckMode) {
stage("Release Check") {
stage("Release-Check") {
script {
launchReleaseCheck(this)
}

View File

@ -125,7 +125,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
def hasTimeoutTest = false
def downloadResultSucceed = false
pipeline.stage('Submit Test Results') {
pipeline.stage('Submit Test Result') {
sh "mkdir -p ${stageName}"
// Download timeout test results
def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt"
@ -554,7 +554,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
]
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
stage('Request Node via SLURM') {
stage('Request Node Via Slurm') {
println("Selected Cluster: ${cluster.name}")
def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)
@ -603,7 +603,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}
}
stage('Checking if the Node is Online') {
stage('Check If Node Is Online') {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
@ -696,20 +696,18 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}
slurmRunner = null
echo "${stageName} Slurm partition timeout: ${partition.time}"
def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
if (cluster.containerRuntime.toString() == "DOCKER") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
} else if (cluster.containerRuntime.toString() == "ENROOT") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
}
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} finally {
stage("Clean up SLURM Resources") {
stage("Clean Up Slurm Resource") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
@ -805,7 +803,7 @@ def getPytestBaseCommandLine(
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
"COLUMNS=400",
"COLUMNS=300",
extraInternalEnv,
portEnvVars,
pytestUtil,
@ -893,9 +891,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def perfSanityMode = stageName.contains("PerfSanity")
def disaggMode = stageName.contains("PerfSanity-Disagg")
def setSegment = disaggMode
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@ -933,19 +929,17 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def sbatchLogPath = "${jobWorkspace}/job-output.log"
def slurmJobLogPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
stage("[${stageName}] Initializing Test") {
stage("Initialize Test") {
println("Selected Cluster: ${cluster.name}")
// Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)
@ -1052,7 +1046,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Generate Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
def mounts = getMountListForSlurmTest(cluster, true).join(",")
String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment)
String[] taskArgs = getNodeArgs(nodeCount, gpuCount, disaggMode)
if (taskArgs == null) {
error "Invalid Slurm test stage name is set"
}
@ -1140,7 +1134,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${sbatchLogPath}
#SBATCH --output=${slurmJobLogPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@ -1182,8 +1176,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}")
// Output is the corresponding scriptLaunchPathLocal script under the disaggMode
sh """
@ -1218,8 +1210,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptLaunchPathNode,
true
)
def filesToKeepWhenRetry = [
scriptRunPathNode,
scriptInstallPathNode,
scriptBashUtilsPathNode,
scriptLaunchPathNode,
scriptSubmitPathNode,
scriptTrackPathNode,
testListPathNode,
waivesListPathNode,
coverageConfigFile
]
def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("")
def scriptSubmit = """#!/bin/bash
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
# Clean up previous job intermediate files so that retry can work
@ -1227,26 +1233,26 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
echo "Found previous Slurm job ID: \${previous_job_id}"
scancel "\${previous_job_id}" || true
rm -rf "${jobWorkspace}/slurm_job_id.txt"
# Wait for 60 seconds to ensure the previous job is canceled
sleep 60
# Wait for 120 seconds to ensure the previous job is canceled
sleep 120
fi
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${sbatchLogPath}"
touch ${sbatchLogPath}
# Clean up workspace: remove all files/dirs not in the keep list
find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} +
touch ${slurmJobLogPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted Slurm job \$jobId"
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo \$jobId > $jobWorkspace/slurm_job_id.txt
# Save Slurm job ID for later steps to retrieve
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
Utils.exec(pipeline, script: "echo \"Script to submit the final Slurm job: \" && cat ${scriptSubmitPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
@ -1255,8 +1261,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true
)
}
stage("[${stageName}] Run Pytest") {
// Submit the sbatch job
// Submit the Slurm job
Utils.exec(
pipeline,
timeout: false,
@ -1266,42 +1273,56 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
),
numRetries: 3
)
def sbatchJobId = Utils.exec(
def slurmJobId = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
"cat $jobWorkspace/slurm_job_id.txt"
)
"\"cat ${jobWorkspace}/slurm_job_id.txt\""
),
returnStdout: true,
numRetries: 3
).trim()
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobId}")
def scriptTrack = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
tail -f ${sbatchLogPath} &
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
jobId=${slurmJobId}
tail -f ${slurmJobLogPath} &
tailPid=\$!
# Wait until sbatch job is done.
# Wait until Slurm job is done
while true; do
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
echo "job is still running"
# Use --allocations to ensure we match the exact job ID and not job steps (like 123.batch, 123.0)
STATUS=\$(sacct -j \$jobId --format=State -Pn --allocations)
if [[ -z \$STATUS || \$STATUS == "RUNNING" || \$STATUS == "PENDING" || \$STATUS == "CONFIGURING" ]]; then
echo "Slurm job \$jobId is still running"
sleep 300
else
echo "Job \$jobId finished with state: \$state"
echo "Slurm job \$jobId finished with state: \$STATUS"
break
fi
done
# Kill tail -f process
kill \$tailPid
# Check if the job failed or not
# Wait briefly to ensure accounting is consistent
sleep 10
# Retry getting status and exit code as sacct might be delayed
# Get exit code (STATUS is already known from loop break)
# Retry for exit code if missing
for i in {1..3}; do
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
# Use awk to parse exit code from format like "0:0"
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
if [ -n "\$EXIT_CODE" ]; then
break
fi
echo "Waiting for sacct to update... attempt \$i"
echo "Waiting for sacct exit code to update... attempt \$i"
sleep 10
done
@ -1309,11 +1330,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
EXIT_CODE=1
fi
if [ -z "\$STATUS" ]; then
echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
STATUS="UNKNOWN"
fi
# We already have valid STATUS from the loop that caused the break
if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
echo "Pytest succeed in Slurm job \$jobId"
echo "Status: \$STATUS | Exit_code \$EXIT_CODE"
@ -1324,7 +1342,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
exit 1
fi
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
Utils.exec(pipeline, script: "echo \"Script to track Slurm job and pull the log: \" && cat ${scriptTrackPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
@ -1332,52 +1352,23 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptTrackPathNode,
true
)
def scriptStatus = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
"""
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptStatusPathLocal,
scriptStatusPathNode,
true
)
sh "cat $scriptStatusPathLocal"
while (true) {
// Check if the job is done by running sacct via SSH
def result = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
scriptStatusPathNode
)
).trim()
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
echo "Slurm job $sbatchJobId is still running, pulling the job log."
// Pulling the sbatch output log
// Track the Slurm job
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptTrackPathNode
),
numRetries: 3
)
)
} else {
echo "Slurm job $sbatchJobId is done."
break
}
}
}
echo "Finished test stage execution."
}
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)
stage("Clean up SLURM Resources") {
stage("Clean Up Slurm Resource") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
@ -1736,7 +1727,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
targetCloud = "kubernetes"
// DGX Spark requires a special setting for accessing the device.
// It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
if (type == "gb10x") {
if (type.contains("gb10x")) {
targetCloud = "nvks-sparks-cloud"
memorySize = "64Gi"
tolerations = """
@ -1755,7 +1746,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
// The following GPU types don't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
if (type == "gb10x") {
if (type.contains("gb10x")) {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
@ -2595,7 +2586,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
// Step 2: run tests
stage ("Setup environment")
stage ("Setup Environment")
{
// Random sleep to avoid resource contention
sleep(10 * Math.random())
@ -2647,7 +2638,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
if (testFilter[(DEBUG_MODE)]) {
stage("Interactive debug session")
stage("Interactive Debug Session")
{
testFilter[(DEBUG_MODE)] = false
@ -2848,7 +2839,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
// Generate comprehensive rerun report if any reruns occurred
stage ("[${stageName}] Generate Report") {
stage ("Generate Report") {
generateRerunReport(stageName, llmSrc)
}
@ -2859,7 +2850,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
if (perfMode) {
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
stage("Check perf result") {
stage("Check Perf Result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
@ -2872,7 +2863,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
stage("Create perf report") {
stage("Create Perf Report") {
sh """
python3 ${llmSrc}/tests/integration/defs/perf/create_perf_comparison_report.py \
--output_path ${stageName}/report.pdf \
@ -2883,7 +2874,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
if (stageName.contains("PerfSanity")) {
stage ("Check perf result") {
stage ("Check PerfSanity Result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
@ -3079,7 +3070,7 @@ def ensureStageResultNotUploaded(stageName) {
if(!GlobalState.uploadResultStageNames.contains(stageName)) {
GlobalState.uploadResultStageNames.add(stageName)
} else {
stage('Upload Test Results') {
stage('Upload Test Result') {
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
error "Upload test results for ${stageName} failed because it has already been uploaded."
}
@ -3288,7 +3279,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
// PerfSanity post-merge tests
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
@ -3311,8 +3302,7 @@ def launchTestJobs(pipeline, testFilter)
parallelJobs += parallelSlurmJobs
// Try to match what are being tested on x86 H100_PCIe.
// SBSA machines from the Blossom machine pool
// SBSA machines from the Blossom machine pool
SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
// DGX Spark is also named as GB10 Grace Blackwell Superchip.
@ -3328,13 +3318,13 @@ def launchTestJobs(pipeline, testFilter)
// Disable GB300 stages due to nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
// Perf sanity pre merge test
// PerfSanity pre-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
// Perf sanity post merge test
// PerfSanity post-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
@ -3355,9 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// Perf sanity pre merge tests
// "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// Perf sanity post merge tests
// PerfSanity post-merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
@ -3539,7 +3527,7 @@ def launchTestJobs(pipeline, testFilter)
}
if (checkPipStage) {
stage("Run LLMAPI tests") {
stage("Run LLMAPI Test") {
pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch)
trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", {
echo "###### Prerequisites Start ######"
@ -3751,8 +3739,8 @@ def launchTestJobs(pipeline, testFilter)
parallelJobsFiltered = parallelJobsFiltered.collectEntries { key, values -> [key, {
stage(key) {
if (key in testFilter[REUSE_STAGE_LIST]) {
stage("Skip - reused") {
echo "Skip - Passed in the last pipeline."
stage("Skip - Reused") {
echo "Skip - Passed in the previous pipelines."
}
} else if (values instanceof List) {
trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
@ -3876,7 +3864,7 @@ pipeline {
OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials")
}
stages {
stage("Setup environment")
stage("Setup Environment")
{
steps
{
@ -3891,7 +3879,7 @@ pipeline {
}
}
}
stage("Check Test Lists")
stage("Check Test List")
{
when {
expression {

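Note on the hunks above: the tracking script waits for the submitted Slurm job by polling sacct and then derives pass/fail from the recorded exit code. A minimal standalone sketch of that polling pattern follows; the job ID, log path, and poll interval are placeholders for illustration, not values taken from this pipeline.

#!/bin/bash
# Minimal sketch of sacct-based job tracking (placeholder job ID and log path).
set -euo pipefail
JOB_ID="$1"
LOG_FILE="${2:-job-output.log}"
# Stream the job log while waiting.
tail -f "$LOG_FILE" &
TAIL_PID=$!
while true; do
    # --allocations matches only the allocation itself, not steps like <id>.batch;
    # -P produces parseable output and -n drops the header line.
    STATE=$(sacct -j "$JOB_ID" --format=State -Pn --allocations)
    case "$STATE" in
        ""|RUNNING|PENDING|CONFIGURING) sleep 60 ;;
        *) echo "Job $JOB_ID finished with state: $STATE"; break ;;
    esac
done
kill "$TAIL_PID" 2>/dev/null || true
# ExitCode is reported as "<code>:<signal>"; keep the numeric code.
EXIT_CODE=$(sacct -j "$JOB_ID" --format=ExitCode -Pn --allocations | awk -F: '{print $1}')
[ "$STATE" = "COMPLETED" ] && [ "${EXIT_CODE:-1}" -eq 0 ]

Invoked, for example, as ./track.sh 12345 job-output.log once sbatch has returned the job ID.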
View File

@ -12,7 +12,14 @@ slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
# Use unique lock file for this job ID
lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"
if [ $SLURM_LOCALID -eq 0 ]; then
if [ -f "$lock_file" ]; then
rm -f "$lock_file"
fi
retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
@ -27,11 +34,11 @@ slurm_install_setup() {
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)"
touch install_lock.lock
touch "$lock_file"
else
echo "(Waiting for install lock) Current directory: $(pwd)"
while [ ! -f install_lock.lock ]; do
sleep 5
while [ ! -f "$lock_file" ]; do
sleep 10
done
fi
}

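Note on the hunk above: scoping the lock file to the Slurm job and node means a stale install_lock.lock from an earlier job can no longer release waiting ranks too early. A minimal sketch of that rank-0-installs, other-ranks-wait pattern, with the actual install step replaced by a placeholder:

#!/bin/bash
# Sketch: local rank 0 installs and signals via a per-job, per-node lock file;
# the other ranks on the same node wait for it. The install step is a placeholder.
set -euo pipefail
lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"
if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
    rm -f "$lock_file"            # clear any stale lock left by a previous run
    echo "rank 0: installing..."  # placeholder for the real download/install commands
    touch "$lock_file"            # signal completion to the waiting ranks
else
    while [ ! -f "$lock_file" ]; do
        sleep 10
    done
fi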
View File

@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10
# Sleep 30 seconds to wait for the coverage config file to be saved
sleep 30
fi
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
@ -108,6 +108,25 @@ eval $pytestCommand
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"
# DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4)
# Remove this after the issue is resolved
if [ $pytest_exit_code -eq 4 ]; then
echo "DEBUG: Pytest failed with usage error (exit code 4)"
echo "DEBUG: Directory state at $(pwd):"
ls -l
echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:"
ls -l $llmSrcNode/tests/integration/defs
echo "DEBUG: conftest.py content:"
md5sum $llmSrcNode/tests/integration/defs/conftest.py
echo "DEBUG: pytest.ini content:"
md5sum $llmSrcNode/tests/integration/defs/pytest.ini
echo "DEBUG: Check importability of conftest.py"
python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')"
fi
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv"
@ -136,11 +155,11 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
fi
if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
echo "Check Perf-Sanity Result"
echo "Check PerfSanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
echo "Rank${SLURM_PROCID} PerfSanity check finished execution with exit code $perf_sanity_check_exit_code"
fi
if [ "$pytest_exit_code" -ne 0 ]; then

View File

@ -15,6 +15,7 @@ Note:
All the perf tests will be excluded since they are generated dynamically.
"""
import argparse
import glob
import os
import subprocess
@ -42,7 +43,13 @@ def verify_l0_test_lists(llm_src):
test_list = f"{llm_src}/l0_test.txt"
# Remove dynamically generated perf tests
subprocess.run(f"rm -f {test_db_path}/*perf*", shell=True, check=True)
# Exclude perf_sanity tests from being removed since they are different and statically defined
for file_path in glob.glob(os.path.join(test_db_path, "*perf*")):
if "perf_sanity" not in os.path.basename(file_path):
try:
os.remove(file_path)
except OSError:
pass
subprocess.run(
f"trt-test-db -d {test_db_path} --test-names --output {test_list}",
shell=True,

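Note on the hunk above: the Python change stops deleting the statically defined perf_sanity test lists while still removing the dynamically generated perf ones. As an illustrative shell-level equivalent of the same exclusion (not what the script actually uses), find can skip those files by name:

#!/bin/bash
# Illustrative alternative only: delete generated *perf* test-db files while
# keeping anything whose name contains perf_sanity. TEST_DB_PATH is a placeholder.
set -euo pipefail
TEST_DB_PATH="${1:?usage: $0 <test_db_path>}"
find "$TEST_DB_PATH" -maxdepth 1 -type f -name '*perf*' ! -name '*perf_sanity*' -print -delete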
View File

@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml():
import psutil
logger.warning(
f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
)
f"pynvml not available, using fallback commands for memory monitoring")
gpu_memory = {}
system_total_mb = 0

View File

@ -542,3 +542,4 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890)
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536)
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5778381)