Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[None][ci] Some tweaks for the CI pipeline (#10359)
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Commit c4f27fa4c0, parent afc533193d
@@ -585,7 +585,7 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
parallelJobs.failFast = enableFailFast

if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) {
def key = "Build with build type Debug"
def key = "Build With Build Type Debug"
parallelJobs += [
(key): {
script {
@@ -628,7 +628,7 @@ pipeline {
HF_DATASETS_OFFLINE=1
}
stages {
stage("BuildJob") {
stage("Build Job") {
steps {
launchStages(this, params.targetArch, params.enableFailFast, globalVars)
}

@@ -276,7 +276,7 @@ def buildImage(config, imageKeyToTag)
}

// Step 2: Build the images
stage ("Install packages") {
stage ("Install Package") {
sh "pwd && ls -alh"
sh "env | sort"
sh "apk add make git"
@@ -380,7 +380,7 @@ def buildImage(config, imageKeyToTag)
}

if (customTag) {
stage ("custom tag: ${customTag} (${arch})") {
stage ("Custom Tag: ${customTag} (${arch})") {
sh """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \
@@ -395,7 +395,7 @@ def buildImage(config, imageKeyToTag)
} catch (Exception ex) {
containerGenFailure = ex
} finally {
stage ("Docker logout") {
stage ("Docker Logout") {
withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
sh "docker logout urm.nvidia.com"
sh "docker logout ${DEFAULT_GIT_URL}:5005"
@@ -424,14 +424,14 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {

def release_action = params.action
def buildConfigs = [
"Build trtllm release (x86_64)": [
"Build Internal release (x86_64 trtllm)": [
target: "trtllm",
action: release_action,
customTag: LLM_BRANCH_TAG + "-x86_64",
build_wheel: true,
dockerfileStage: "release",
],
"Build trtllm release (SBSA)": [
"Build Internal release (SBSA trtllm)": [
target: "trtllm",
action: release_action,
customTag: LLM_BRANCH_TAG + "-sbsa",
@@ -439,21 +439,21 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
arch: "arm64",
dockerfileStage: "release",
],
"Build CI image (x86_64 tritondevel)": [:],
"Build CI image (SBSA tritondevel)": [
"Build CI Image (x86_64 tritondevel)": [:],
"Build CI Image (SBSA tritondevel)": [
arch: "arm64",
],
"Build CI image (RockyLinux8 Python310)": [
"Build CI Image (RockyLinux8 Python310)": [
target: "rockylinux8",
args: "PYTHON_VERSION=3.10.12",
postTag: "-py310",
],
"Build CI image (RockyLinux8 Python312)": [
"Build CI Image (RockyLinux8 Python312)": [
target: "rockylinux8",
args: "PYTHON_VERSION=3.12.3",
postTag: "-py312",
],
"Build NGC devel and release (x86_64)": [
"Build NGC devel And release (x86_64)": [
target: "ngc-release",
action: release_action,
args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'",
@@ -464,7 +464,7 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
],
dockerfileStage: "release",
],
"Build NGC devel and release (SBSA)": [
"Build NGC devel And release (SBSA)": [
target: "ngc-release",
action: release_action,
args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'",
@@ -583,7 +583,7 @@ pipeline {
}
}
}
stage("Upload Artifacts") {
stage("Upload Artifact") {
steps {
script {
String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag
@@ -594,7 +594,7 @@ pipeline {
}
}
}
stage("Wait for Build Jobs Complete") {
stage("Wait For Build Job Complete") {
when {
expression {
RUN_SANITY_CHECK
@@ -655,7 +655,7 @@ pipeline {
}
}
}
stage("Sanity Check for NGC Images") {
stage("Sanity Check For NGC Image") {
when {
expression {
RUN_SANITY_CHECK
@@ -691,7 +691,7 @@ pipeline {
}
}
}
stage("Register NGC Images for Security Checks") {
stage("Register NGC Image For Security Check") {
when {
expression {
return params.nspect_id && params.action == "push"

@@ -451,7 +451,7 @@ def launchReleaseCheck(pipeline)
}

def image = "urm.nvidia.com/docker/golang:1.22"
stageName = "Release Check"
stageName = "Release-Check"
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", {
stage("[${stageName}] Run") {
if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) {
@@ -834,7 +834,7 @@ def collectTestResults(pipeline, testFilter)
{
collectResultPodSpec = createKubernetesPodConfig("", "agent")
trtllm_utils.launchKubernetesPod(pipeline, collectResultPodSpec, "alpine", {
stage ("Collect test result") {
stage ("Collect Test Result") {
sh "rm -rf **/*.xml *.tar.gz"

testResultLink = "https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}/test-results"
@@ -864,7 +864,7 @@ def collectTestResults(pipeline, testFilter)

junit(testResults: '**/results*.xml', allowEmptyResults : true)
} // Collect test result stage
stage("Rerun report") {
stage("Rerun Report") {
sh "rm -rf rerun && mkdir -p rerun"
sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true"
sh "find rerun -type f"
@@ -904,7 +904,7 @@ def collectTestResults(pipeline, testFilter)
}
} // Rerun report stage
try {
stage("Test coverage") {
stage("Test Coverage") {
sh "ls"
def CUR_PATH = sh(returnStdout: true, script: 'pwd').replaceAll("\\s","")
sh "echo ${CUR_PATH}"
@@ -1030,14 +1030,15 @@ def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64"
def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
{
stages = [
"Release Check": {
"Release-Check": {
script {
launchReleaseCheck(this)
}
},
"x86_64-linux": {
"x86_64-Linux": {
script {
stage("Build") {
def testStageName = "[Build-x86_64] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def additionalParameters = [
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
@@ -1045,7 +1046,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
]
launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters)
}
def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"

testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
stage(testStageName) {
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
@@ -1135,24 +1137,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
}
}
},
"SBSA-linux": {
"SBSA-Linux": {
script {
def jenkinsUrl = ""
def credentials = ""
def testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false

if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") {
echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run"
return
}

stage("Build") {
def testStageName = "[Build-SBSA] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def additionalParameters = [
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
]
launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
}

testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
stage(testStageName) {
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
echo "SBSA test job is skipped due to Jenkins configuration"
@@ -1269,9 +1270,9 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
echo "Will run Build-Docker-Images job"
stages.remove("x86_64-linux")
stages.remove("SBSA-linux")
echo "Build-Docker-Images job is set explicitly. Both x86_64-linux and SBSA-linux sub-pipelines will be disabled."
stages.remove("x86_64-Linux")
stages.remove("SBSA-Linux")
echo "Build-Docker-Images job is set explicitly. Both x86_64-Linux and SBSA-Linux sub-pipelines will be disabled."
}

parallelJobs = stages.collectEntries{key, value -> [key, {
@@ -1339,11 +1340,11 @@ pipeline {
}
}
}
stage("Build and Test") {
stage("Build And Test") {
steps {
script {
if (isReleaseCheckMode) {
stage("Release Check") {
stage("Release-Check") {
script {
launchReleaseCheck(this)
}

@@ -125,7 +125,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
def hasTimeoutTest = false
def downloadResultSucceed = false

pipeline.stage('Submit Test Results') {
pipeline.stage('Submit Test Result') {
sh "mkdir -p ${stageName}"
// Download timeout test results
def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt"
@@ -554,7 +554,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
]

Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
stage('Request Node via SLURM') {
stage('Request Node Via Slurm') {
println("Selected Cluster: ${cluster.name}")

def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)
@@ -603,7 +603,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}
}

stage('Checking if the Node is Online') {
stage('Check If Node Is Online') {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
@@ -696,20 +696,18 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}

slurmRunner = null
echo "${stageName} Slurm partition timeout: ${partition.time}"
def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
if (cluster.containerRuntime.toString() == "DOCKER") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
} else if (cluster.containerRuntime.toString() == "ENROOT") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
}
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} finally {
stage("Clean up SLURM Resources") {
stage("Clean Up Slurm Resource") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
@@ -805,7 +803,7 @@ def getPytestBaseCommandLine(
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
"COLUMNS=400",
"COLUMNS=300",
extraInternalEnv,
portEnvVars,
pytestUtil,
@@ -893,9 +891,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def perfSanityMode = stageName.contains("PerfSanity")
def disaggMode = stageName.contains("PerfSanity-Disagg")
def setSegment = disaggMode

Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

@@ -933,19 +929,17 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def sbatchLogPath = "${jobWorkspace}/job-output.log"
def slurmJobLogPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc"

stage("[${stageName}] Initializing Test") {
stage("Initialize Test") {
println("Selected Cluster: ${cluster.name}")
// Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)

@@ -1052,7 +1046,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Generate Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
def mounts = getMountListForSlurmTest(cluster, true).join(",")
String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment)
String[] taskArgs = getNodeArgs(nodeCount, gpuCount, disaggMode)
if (taskArgs == null) {
error "Invalid Slurm test stage name is set"
}
@@ -1140,7 +1134,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG

def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${sbatchLogPath}
#SBATCH --output=${slurmJobLogPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
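For reference, the prefix generated above resolves to an ordinary sbatch header; a minimal sketch under assumed values (the workspace path and the 04:00:00 limit are illustrative, not taken from this commit):

#!/bin/bash
# Hypothetical resolved form of the generated prefix: send combined stdout/stderr
# to the job log that the tracking script tails, and cap the allocation time.
#SBATCH --output=/path/to/jobWorkspace/job-output.log
#SBATCH --time=04:00:00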
@@ -1182,8 +1176,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG

pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}")

// Output is the corresponding scriptLaunchPathLocal script under the disaggMode
sh """
@@ -1218,8 +1210,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptLaunchPathNode,
true
)

def filesToKeepWhenRetry = [
scriptRunPathNode,
scriptInstallPathNode,
scriptBashUtilsPathNode,
scriptLaunchPathNode,
scriptSubmitPathNode,
scriptTrackPathNode,
testListPathNode,
waivesListPathNode,
coverageConfigFile
]
def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("")

def scriptSubmit = """#!/bin/bash
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

# Clean up previous job intermediate files so that retry can work
@@ -1227,26 +1233,26 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
echo "Found previous Slurm job ID: \${previous_job_id}"
scancel "\${previous_job_id}" || true
rm -rf "${jobWorkspace}/slurm_job_id.txt"
# Wait for 60 seconds to ensure the previous job is canceled
sleep 60
# Wait for 120 seconds to ensure the previous job is canceled
sleep 120
fi
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${sbatchLogPath}"

touch ${sbatchLogPath}
# Clean up workspace: remove all files/dirs not in the keep list
find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} +

touch ${slurmJobLogPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted Slurm job \$jobId"
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo \$jobId > $jobWorkspace/slurm_job_id.txt
# Save Slurm job ID for later steps to retrieve
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
""".replaceAll("(?m)^\\s*", "").trim()

pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
Utils.exec(pipeline, script: "echo \"Script to submit the final Slurm job: \" && cat ${scriptSubmitPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
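The submit script above boils down to a retry-safe pattern: cancel any job left over from a previous attempt, prune the workspace down to a keep-list, then submit and persist the new job ID. A condensed standalone sketch under assumed paths (WORKSPACE, launch.sh, and the keep-list entries are placeholders, not the pipeline's interpolated values):

#!/bin/bash
set -xEeuo pipefail
WORKSPACE=/path/to/jobWorkspace   # hypothetical; the pipeline interpolates ${jobWorkspace}

# Cancel a leftover job from a previous attempt, if any.
if [ -f "$WORKSPACE/slurm_job_id.txt" ]; then
    scancel "$(cat "$WORKSPACE/slurm_job_id.txt")" || true
    rm -f "$WORKSPACE/slurm_job_id.txt"
    sleep 120   # give Slurm time to release the allocation
fi

# Prune everything except the scripts and lists needed for the retry.
keep_args=""
for f in run.sh install.sh launch.sh waives.txt; do   # illustrative keep-list
    keep_args+=" ! -name $f"
done
find "$WORKSPACE" -maxdepth 1 -mindepth 1 $keep_args -exec rm -rf {} +

# Submit and persist the job ID; sbatch prints "Submitted batch job <id>".
jobId=$(sbatch "$WORKSPACE/launch.sh" | awk '{print $4}')
[ -n "$jobId" ] || { echo "sbatch returned no job ID" >&2; exit 1; }
echo "$jobId" > "$WORKSPACE/slurm_job_id.txt"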
@@ -1255,8 +1261,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true
)
}

stage("[${stageName}] Run Pytest") {
// Submit the sbatch job
// Submit the Slurm job
Utils.exec(
pipeline,
timeout: false,
@@ -1266,42 +1273,56 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
),
numRetries: 3
)
def sbatchJobId = Utils.exec(

def slurmJobId = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
"cat $jobWorkspace/slurm_job_id.txt"
)
"\"cat ${jobWorkspace}/slurm_job_id.txt\""
),
returnStdout: true,
numRetries: 3
).trim()
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobId}")

def scriptTrack = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
tail -f ${sbatchLogPath} &
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

jobId=${slurmJobId}
tail -f ${slurmJobLogPath} &
tailPid=\$!
# Wait until sbatch job is done.

# Wait until Slurm job is done
while true; do
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
echo "job is still running"
# Use --allocations to ensure we match the exact job ID and not job steps (like 123.batch, 123.0)
STATUS=\$(sacct -j \$jobId --format=State -Pn --allocations)

if [[ -z \$STATUS || \$STATUS == "RUNNING" || \$STATUS == "PENDING" || \$STATUS == "CONFIGURING" ]]; then
echo "Slurm job \$jobId is still running"
sleep 300
else
echo "Job \$jobId finished with state: \$state"
echo "Slurm job \$jobId finished with state: \$STATUS"
break
fi
done

# Kill tail -f process
kill \$tailPid
# Check if the job failed or not

# Wait briefly to ensure accounting is consistent
sleep 10
# Retry getting status and exit code as sacct might be delayed

# Get exit code (STATUS is already known from loop break)
# Retry for exit code if missing
for i in {1..3}; do
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
# Use awk to parse exit code from format like "0:0"
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')

if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
if [ -n "\$EXIT_CODE" ]; then
break
fi
echo "Waiting for sacct to update... attempt \$i"
echo "Waiting for sacct exit code to update... attempt \$i"
sleep 10
done

@@ -1309,11 +1330,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
EXIT_CODE=1
fi
if [ -z "\$STATUS" ]; then
echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
STATUS="UNKNOWN"
fi

# We already have valid STATUS from the loop that caused the break
if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
echo "Pytest succeed in Slurm job \$jobId"
echo "Status: \$STATUS | Exit_code \$EXIT_CODE"
@@ -1324,7 +1342,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
exit 1
fi
""".replaceAll("(?m)^\\s*", "").trim()

pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
Utils.exec(pipeline, script: "echo \"Script to track Slurm job and pull the log: \" && cat ${scriptTrackPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
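The tracking logic above reduces to polling sacct for the allocation's state and then reading its exit code. A minimal standalone sketch of that pattern (the job ID is assumed to be passed by the caller, and the 30-second poll interval is illustrative):

#!/bin/bash
set -Eeuo pipefail
jobId=$1   # Slurm job ID, assumed to be supplied by the caller

# Poll only the allocation record (-P parseable, -n no header, --allocations skips steps like 123.batch, 123.0).
while :; do
    state=$(sacct -j "$jobId" --format=State -Pn --allocations)
    case "$state" in
        ""|RUNNING|PENDING|CONFIGURING) echo "job $jobId still running"; sleep 30 ;;
        *) echo "job $jobId finished: $state"; break ;;
    esac
done

# ExitCode is reported as "rc:signal" (e.g. "0:0"); keep the return-code half.
exit_code=$(sacct -j "$jobId" --format=ExitCode -Pn --allocations | awk -F: '{print $1}')

# Exit status of the script reflects whether the job completed cleanly.
[ "$state" = "COMPLETED" ] && [ "$exit_code" -eq 0 ]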
@@ -1332,52 +1352,23 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptTrackPathNode,
true
)
def scriptStatus = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
"""
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptStatusPathLocal,
scriptStatusPathNode,
true
)

sh "cat $scriptStatusPathLocal"
while (true) {
// Check if the job is done by running sacct via SSH
def result = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
scriptStatusPathNode
)
).trim()
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
echo "Slurm job $sbatchJobId is still running, pulling the job log."
// Pulling the sbatch output log
// Track the Slurm job
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptTrackPathNode
),
numRetries: 3
)
)
} else {
echo "Slurm job $sbatchJobId is done."
break
}
}
}
echo "Finished test stage execution."
}
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)
stage("Clean up SLURM Resources") {
stage("Clean Up Slurm Resource") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
@@ -1736,7 +1727,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
targetCloud = "kubernetes"
// DGX Spark requires a special setting for accessing the device.
// It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
if (type == "gb10x") {
if (type.contains("gb10x")) {
targetCloud = "nvks-sparks-cloud"
memorySize = "64Gi"
tolerations = """
@@ -1755,7 +1746,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod

// The following GPU types doesn't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
if (type == "gb10x") {
if (type.contains("gb10x")) {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
@@ -2595,7 +2586,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}

// Step 2: run tests
stage ("Setup environment")
stage ("Setup Environment")
{
// Random sleep to avoid resource contention
sleep(10 * Math.random())
@@ -2647,7 +2638,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}

if (testFilter[(DEBUG_MODE)]) {
stage("Interactive debug session")
stage("Interactive Debug Session")
{
testFilter[(DEBUG_MODE)] = false

@@ -2848,7 +2839,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}

// Generate comprehensive rerun report if any reruns occurred
stage ("[${stageName}] Generate Report") {
stage ("Generate Report") {
generateRerunReport(stageName, llmSrc)
}

@@ -2859,7 +2850,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
if (perfMode) {
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
stage("Check perf result") {
stage("Check Perf Result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
@@ -2872,7 +2863,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
stage("Create perf report") {
stage("Create Perf Report") {
sh """
python3 ${llmSrc}/tests/integration/defs/perf/create_perf_comparison_report.py \
--output_path ${stageName}/report.pdf \
@@ -2883,7 +2874,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}

if (stageName.contains("PerfSanity")) {
stage ("Check perf result") {
stage ("Check PerfSanity Result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
@@ -3079,7 +3070,7 @@ def ensureStageResultNotUploaded(stageName) {
if(!GlobalState.uploadResultStageNames.contains(stageName)) {
GlobalState.uploadResultStageNames.add(stageName)
} else {
stage('Upload Test Results') {
stage('Upload Test Result') {
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
error "Upload test results for ${stageName} failed because it has already been uploaded."
}
@@ -3288,7 +3279,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
// PerfSanity post-merge tests
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
@@ -3311,8 +3302,7 @@ def launchTestJobs(pipeline, testFilter)

parallelJobs += parallelSlurmJobs

// Try to match what are being tested on x86 H100_PCIe.
// SBSA machines from the Blossom machine pool
// SBSA machines from the Blossom machine pool
SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
// DGX Spark is also named as GB10 Grace Blackwell Superchip.
@@ -3328,13 +3318,13 @@ def launchTestJobs(pipeline, testFilter)
// Disable GB300 stages due to nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
// Perf sanity pre merge test
// PerfSanity pre-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
// Perf sanity post merge test
// PerfSanity post-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
@@ -3355,9 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// Perf sanity pre merge tests
// "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// Perf sanity post merge tests
// PerfSanity post-merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
@@ -3539,7 +3527,7 @@ def launchTestJobs(pipeline, testFilter)
}

if (checkPipStage) {
stage("Run LLMAPI tests") {
stage("Run LLMAPI Test") {
pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch)
trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", {
echo "###### Prerequisites Start ######"
@@ -3751,8 +3739,8 @@ def launchTestJobs(pipeline, testFilter)
parallelJobsFiltered = parallelJobsFiltered.collectEntries { key, values -> [key, {
stage(key) {
if (key in testFilter[REUSE_STAGE_LIST]) {
stage("Skip - reused") {
echo "Skip - Passed in the last pipeline."
stage("Skip - Reused") {
echo "Skip - Passed in the previous pipelines."
}
} else if (values instanceof List) {
trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
@@ -3876,7 +3864,7 @@ pipeline {
OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials")
}
stages {
stage("Setup environment")
stage("Setup Environment")
{
steps
{
@@ -3891,7 +3879,7 @@ pipeline {
}
}
}
stage("Check Test Lists")
stage("Check Test List")
{
when {
expression {

@@ -12,7 +12,14 @@ slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src

# Use unique lock file for this job ID
lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"

if [ $SLURM_LOCALID -eq 0 ]; then
if [ -f "$lock_file" ]; then
rm -f "$lock_file"
fi

retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
@@ -27,11 +34,11 @@ slurm_install_setup() {
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)"
touch install_lock.lock
touch "$lock_file"
else
echo "(Waiting for install lock) Current directory: $(pwd)"
while [ ! -f install_lock.lock ]; do
sleep 5
while [ ! -f "$lock_file" ]; do
sleep 10
done
fi
}

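The lock-file change above follows a simple rank-0 barrier: the local leader performs the install and then touches a per-job lock file, while the other local ranks spin until it appears. A stripped-down sketch of that pattern (the install step is a placeholder; the variables are standard Slurm environment variables):

#!/bin/bash
# Per-job, per-node lock name so concurrent or retried jobs sharing a
# workspace do not trip over each other's stale locks.
lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"

if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
    rm -f "$lock_file"            # drop a stale lock from a previous attempt
    echo "rank 0: installing..."  # placeholder for the real install steps
    touch "$lock_file"            # signal the other local ranks
else
    while [ ! -f "$lock_file" ]; do
        sleep 10
    done
fi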
@@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10
# Sleep 30 seconds to wait for the coverage config file to be saved
sleep 30
fi

containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
@@ -108,6 +108,25 @@ eval $pytestCommand
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

# DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4)
# Remove this after the issue is resolved
if [ $pytest_exit_code -eq 4 ]; then
echo "DEBUG: Pytest failed with usage error (exit code 4)"
echo "DEBUG: Directory state at $(pwd):"
ls -l
echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:"
ls -l $llmSrcNode/tests/integration/defs

echo "DEBUG: conftest.py content:"
md5sum $llmSrcNode/tests/integration/defs/conftest.py

echo "DEBUG: pytest.ini content:"
md5sum $llmSrcNode/tests/integration/defs/pytest.ini

echo "DEBUG: Check importability of conftest.py"
python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')"
fi

if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv"
@@ -136,11 +155,11 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
fi

if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
echo "Check Perf-Sanity Result"
echo "Check PerfSanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
echo "Rank${SLURM_PROCID} PerfSanity check finished execution with exit code $perf_sanity_check_exit_code"
fi

if [ "$pytest_exit_code" -ne 0 ]; then

@@ -15,6 +15,7 @@ Note:
All the perf tests will be excluded since they are generated dynamically.
"""
import argparse
import glob
import os
import subprocess

@@ -42,7 +43,13 @@ def verify_l0_test_lists(llm_src):
test_list = f"{llm_src}/l0_test.txt"

# Remove dynamically generated perf tests
subprocess.run(f"rm -f {test_db_path}/*perf*", shell=True, check=True)
# Exclude perf_sanity tests from being removed since they are different and statically defined
for file_path in glob.glob(os.path.join(test_db_path, "*perf*")):
if "perf_sanity" not in os.path.basename(file_path):
try:
os.remove(file_path)
except OSError:
pass
subprocess.run(
f"trt-test-db -d {test_db_path} --test-names --output {test_list}",
shell=True,

@@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml():
import psutil

logger.warning(
f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
)
f"pynvml not available, using fallback commands for memory monitoring")

gpu_memory = {}
system_total_mb = 0

@@ -542,3 +542,4 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890)
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536)
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5778381)